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int k ; 

int address ; 

int *iiKlxs = 0 ; 

int skiplt ; 
S int numHits = 0 ; 

unsigned char **1)itAnay ; 

int size ; 

int what ; 

int this ; 
10 InitO; 

if ( !( bitArray — (unsigned char 

**)UTL_MH4_CALLOC(bitset- > num VariationSites, 



sizeof(undgned char ^)) ) 
IS goto AddTraceback ; 

for ( i = 0 ; i < bitset-> num VariationSites ; i++ ) 

{ 

** We only want to count the fragments for the site that is being exploded. 
20 */ 

if ((site != -1 )&&(i != site) ) 
continue ; 

size = ( bitset->actuallSizes[i] + 7 } / 8 ; 
if ( !(bitAnay[i] = (unsigned char 
25 •)UTL_MEM_CALLOC(aze,Mzeof(unsigned char)))) 

goto AddTraceback ; 

} 

for ( address - -1 , i = 0 ; i < bitset->totaISelected ; i++ ) 
{ 

30 address = IHBFindNextOne(bitset- > bitset,address-t- 1); 

BitSetAddressToIndexes(bitset,address,&indxs,0); 

/* 

** The sites that have already been expanded will constraint what hits 
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** we find, 
*/ 

if ( numFixedSites ) 
{ 

5 skipit = 0 ; 

for ( k = 0 ; k < bitset->numVariationSites ; k+-l- ) 

{ 

if ( fixedSitesIndexestkl == -I ) 
continue ; 

10 /* 

our hit index nuitches our constraint index. 

♦/ 

if ( fixedSitesIndexes[k] ! = tndxs[k] ) 

{ 

15 skipit = 1 ; 

break; 

} 

} 

if ( skipit ) 
20 continue ; 

} 

numHits+-f ; 

for ( j = 0 ; j < bitset->numVariationSiles ; j++ ) 

{ 

25 if ( ( site != -1 ) ( j != site ) ) 

continue ; 
what = indxs(j] % 8; 
this = indxsQ] / 8; 
bitAnaylj][this] { = setbits[what]; 

30 } 



} 



#ifO 
/• 
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** Figure out how many hits there are for this site. 
♦/ 

for ( k = 0 ; k < bitset->numVariationSites ; k+-f- ) 
{ 

5 if( site!= k) 

continue ; 

size = (-bitset->actuallSizes[k] + 7 ) / 8 ; 
numFragmaitsPeiSite[k] = 0 ; 
for ( j = 0 ; j < size ; j++ ) 
10 nuniFragmentsPerSite[k] + = nbits[bitArray[k][j] & 255]; 

} 

#endif 
/* 

Now get the indexes for all the hits. 

15 */ 

if ( !( (*hitlndexes) = (int *)UTL_MEM XALLOC(numHits. 



sizeof(int))) 



) 

goto AddTraceback ; 
20 numHits = 0 ; 

for ( k = 0 ; k < bitset*>numVariationSites ; k++ ) 

{ 

if (site == -1 ) 
{ 

25 if ( fixedSite$Indexes[k] != -1 ) 

continue ; 

} 

else 

{ 

30 if(site!=k) 

continue ; 

} 

siz = ( bitset- > actuailSizes[k] + 7 ) / 8 ; 
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for ( i = 0 ; i < size ; i++ ) 

{ 

if ( bitAnayPcKq ) 
{ 

5 /* 

** If any bit is set in the byte then we need to figure out what the hits are. 
*/ 

for(j = 0;j < 8 ;j++ ) 
{ 

10 if ( bitAnay[k](i] & seAitsQ] ) 

{ 



(*hitIndexes)[numHits++] = i • 8 + j ; 

} 



} 



15 } 

} 

} 

for ( i = 0 ; i < bitset->numVariationSites ; i++ ) 
if ( bitArray[i] ) 
20 UTL_MEM_FREE(bitAnay[i]); 
UTL^MEM_FREE(bitAnay ); 
return numHits ; 

AddTracd>ack : 
return 0 ; 

25 } 



static int GetPartialProductsStats( struct BitSetFileStruct n>itset , int numFixedSites, int 
"'fixedSitesIndexes, int *numProducts, int *numFragmentsPerSite) 

{ 

int i ; 
30 int j ; 



wo 97/27559 PCT/US97A)1491 

553 

int k ; 

int address ; 

int *indxs = 0 ; 

int skipit ; 
5 int numHits 0 ; 

unsigned char *^itArray ; 

int size ; 

int what ; 

int this ; 
10 InitO; 

if ( !( bitArray = (unsigned char 

**)UTL_MEM_CALLOC(bitset- > numVariationSites, 

sizeof(uns]gned char *))) ) . 
15 goto AddTracebaicIc ; 

for ( i = 0 ; i < bitset-> numVariationSites ; i+-h ) 

{ 

We only want to count the fragments for the sites that are not being 
20 ** exploded. 
*/ 

if ( fixedSitesIndexes[i] -1 ) 
continue ; 

size = ( bitset->actuallSizes[iI + 7 ) / 8 ; 
25 if ( !(bitArFay[i] = (unsigned char 

*)UTL_MEM_CALLOC(Mze,sizeof(unsigned char)))) 

goto AddTraceback ; 

> 

for ( address = -1 , i = 0 ; i < bitset->totalSel6cted ; i++ ) 
30 { 

address = IHBFindNextOne(bitset->bitset,address-M); 
BitSetAddressToIndexes(bits^,address,&indxs,0); 
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The sites that have already been expanded will constraint what hits 
** we find, 
*/ 

if ( numFixedSites ) 
5 { 

ddplt = 0 ; 

for ( k = 0 ; k < bitset->numVariationSites ; k++ ) 
{ 

if ( fixedSitesIndexes[k] - -1 ) 
10 continue ; 

our hit index matches our constraint index. 

*/ 

if ( fixedSitesIndexes[k] != indxs[k] ) 
15 { 

skipit = I ; 
break; 

} 

20 if ( skipit ) 

continue ; 

} 

numHits++ ; 

for ( j = 0 ; j < bitset->numVariationSites ; j++ ) 
25 { 

if ( fixedSitesIndexesQ] != -1 ) 

continue ; 
what = indxsUl % 8; 
this - indxsQ] / 8; 
30 bitArray(j][this] j = setbits[what]; 

} 

} 

for ( k = 0 ; k < bitset- > num VariationSites ; k++ ) 
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{ 



if ( ftxedSitesIndexes[k] != -1 ) 
continue ; 

size = ( bitset->actiialISizes[k] + 7 ) / 8 ; 
numFragmentsPerSite[k] = 0 ; 
for (j = 0 ; j < size ; j++ ) 

nuniFragmentsPerSiteM += nbils[bitArray[k][j] & 255]; 



} 

♦numProducts = numHits ; 
10 for ( i = 0 ; i < bitset->numVariationSites ; i++ ) 

if ( bitAnayfi] ) 

UTL_MEM_^FREE(bitAiTay[i]); 
UTL_Mai_FREE(bitArray ); 
return numHits ; 

15 AddTraceback : 
return 0 ; 

} 

static int GetPartialProducts( struct BitSetFileStruct *bitset , int numFixedSites, int 

"^fixedSitesIndexes, int whichSite, int **sitelndexes) 
20 { 

int i ; 

intj; 

int k ; 

int address ; 
25 int *indxs = 0 ; 

int skipit ; 

int numHits = 0 ; 
InitO; 

for ( address = -1 . i = 0 ; i < bitset- > totalSelected ; i-h+ ) 
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{ 

address = IHBFindNextOne(bitset- > bits^,address + 1 ) ; 
BitSetAddressToIndexes(bitset.address,&indxs,0); 

/* 

5 The sites diat have already been expanded will constraint what hits 
** we find. 
*/ 

if ( numFixedSites ) 
{ 

10 skipit = 0 : 

for ( k = 0 ; k < bltset->numVariationSites ; k++ ) 
{ 

if ( flxedSitesIndexesOc] == -1 ) 
continue ; 

15 /* 

** our hit index matches our constraint index. 
*/ 

if ( fixedSitesIndexes[k] ! = indxs[k] ) 
{ 

20 skipit = 1 ; 

break; 

} 

} 

* 

if ( skipit ) 
25 continue ; 

} 

numHits++ ; 

fprintf(stderr,"Got a hit on %d %d %d\n",address,indxs[0],indxs[l]); 
} 

30 return numHits ; 

AddTraceback : 



return 0 ; 
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} 

Static GetFiagmentsUsedlnASiteC struct BitSetFileStnict *bitset , int whichSite , int ^* 
indxs) 

{ 

5 unsigned char ^bitArray ; 

int i ; 

intj; 

int size ; 

int '^dress ; 
10 int numHits — 0 ; 

int bit ; 

if (!(address = (ini 

*)irrL_MEM_C ALLOC(bitset- > numFragsInEachSite[whichSite] , 

sizeof(int))) ) 

15 goto AddTracd>ack ; 

Figure out how many ints there are in this bitset. 

*/ 

size = ( bitset- >actua]lSizes[whichSite] + 7 ) / 8 ; 
20 for ( bit Array = bitset- >fragmentBitset[whichSite], i = 0 ; i < size ; i++ ) 

{ 

if { bitArrayti] ) 
{ 

25 ** If any bit is set in the byte then we need to figure out what the hits are. 

for(j =0;j < 8 ;j++) 
{ 

if ( bitArraypJ & setbitsD] ) 
30 { 

address[numHits++l = i * 8 + j ; 
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} 

} 

} 

(*indxs) = address ; 
5 return numHits ; 

AddTracd>ack : 
return 0 ; 



sUtic struct BitSetFileStruct *ReadAndAllocate(char ^FileName ,int offset ) 
10 { 

struct BitSetFileStruct *bitset ; 

if ( !(bitset = (struct BitSetFileStruct *)UTL_MEM_CALL0C(1, 

sizeof(struct BitSetFileStruct )))) 
goto AddTraceback ; 
IS if ( !ReadCheckPointFile(fileName, 



&(bitset- > masterFilelnfo.masterFilePathName), 



offset. 



&(bitset- > masterFilelnfo. masterRecNo) , 



20 



25 



goto AddTraceback ; 



&(bitset- > programlnfo.progiamName), 

&(bitset->bitset), 

&(bitset- > num VariationSites), 

&(bits^- > actuallSizes), 

&(bitset- > ailocSizes), 

&(bitset- > totalSelected), 

&(bitset- > numFragsInEachSite), 

&(bitsct- > masterFilelnfo), 

&(bitset- > programlnfo.bufferSize) , 

NULL)) 
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return bitset ; 
AddTracd^ack : 

return ( struct BitS^ileStruci *)NULL ; 

} 

5 static int ReadBitsetCoreInfo(void *bs, char *'^asterFileNaine, int ^masterRecno, char 
**core, char *-*xrString, int *numSites, char ***xFilrf4ames ) 

{ 

struct BitSetFileStruct ^bitset = (struct BitSetFileStruct *) bs; 
int recNo ; 
10 FILE *fp; 
int i ; 

int found = 0 ; 
char *line ; 
char *cp ; 
15 char *cpl ; 

*numSites = bitset- >numVanationSites ; 

♦masterFileName = bitset- >masterFiIeInfo.masterFilePathName; 

*mast^Recno = bitset- >niasterFileInfo.masterRecNo; 

if ( !((*xFileNames) = (char **)UTL_MEM^CALLOC(*numSites, 

20 sizeof(char ♦)) )) 

goto AddTraceback ; 
for ( i = 0 ; i < *numSites ; i++ ) 
(♦xFileNanies)[i] - 
UTL_STR_SAVE(bitset- > niasterFiIeInfo.x_FileName[i]) ; 
25 /• 

Open the core file and read in the core and parse out the XRstring. 

*/ 

if ( !(fp = fopen(bitset->masterFileInfo.corefilePathName/r")) ) 
goto UnableToReadCore ; 
30 recNo = 0 ; 

found = 0 ; 
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whUe ( -1 != UTL^SCAN_GBrS{ fp, "W. &line)) 
{ 

recNo++ ; 

if { rccNo == = bitset- > masterFilelnfo. startCore ) 
5 { 

found — 1 ; 
break; 

} 

} 

10 if ((found) 

goto UnableToReadCore ; 

/• 

** Rq)lace all occurances of Y_Ox with Xx. 
*/ 

15 (•core ) = XJTL_STR_SAVEfline); 

cp = strstrOine."XRLIST="); 
if(!cp) 

* 

(*xrString) = UTL_STR_SAVE(""); 

else 
20 { 

/* ■ 

** Skip the first double quote. 

•/ 

cp += 8 ; 

25 /* 

** Go find the end of double quotes. 
•/ 

cpl = cp ; 
while ((*cp) !=•"•) 
30 cp++ ; 

•cp = 0 ; 

(•xrString) = UTL_STR_SAVE(cpl); 

} 
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fclose(fp); 
return 1 ; 
UnableToReadCore : 

fprintf(stderr/ReadBitsetCoreInfoO - Unable to read core %s 9Jd\n% 
5 bitset- > masterFilelnfo.corefUePathName, 

bitset- > masterFildnfo.startCore); 
AddTraceback : 

fprintf(stderr/ReadBitsetCoreInfoO - Unable to read core info\n"); 
return 0 ; 

10 } 

/*
** ReadMasterCoreInfo
**
** Reads one record header from a master file, then reads record number
** startCore from the core file named there and extracts the quoted value
** that follows XRLIST=.
**
**   masterFile - master file handed to GetMasterRecordHeader().
**   index      - record index handed to GetMasterRecordHeader().
**   core       - out: saved copy of the core record.
**   xrString   - out: saved copy of the XRLIST value ("" when absent).
**   numSites   - out: number of sites (preset to 2 before the header read).
**   xFileNames - out: newly allocated two-entry array of x-file names.
**
** Returns 1 on success or 0 on failure (a message is written to stderr).
*/
static int ReadMasterCoreInfo(char *masterFile, int index, char **core, char **xrString,
int *numSites, char ***xFileNames )
{
    int recNo ;
    FILE *fp = (FILE *)NULL ;
    int found = 0 ;
    char *line ;
    char *cp ;
    char *cp1 ;
    char *prefix = (char *)NULL ;
    char *coreFile = (char *)NULL ;
    char *fpFileName = (char *)NULL ;
    int fpOffset ;
    int mBits ;
    int lBits ;
    int startCore ;

    *numSites = 2 ; /* fixed for now */
    if ( !((*xFileNames) = (char **)UTL_MEM_CALLOC(*numSites,
                                                   sizeof(char *)) ))
        goto AddTraceback ;

    /*
    ** Get the master file info.
    */
    if ( !GetMasterRecordHeader(masterFile,
                                index,
                                &prefix,
                                &mBits,
                                &lBits,
                                &coreFile,
                                &startCore,
                                &(*xFileNames)[0],
                                &(*xFileNames)[1],
                                numSites,
                                &fpFileName,
                                &fpOffset))
        goto AddTraceback ;

    /*
    ** Open the core file and read in the core and parse out the XRstring.
    */
    if ( !(fp = fopen(coreFile,"r")) )
        goto UnableToReadCore ;
    recNo = 0 ;
    found = 0 ;
    /* NOTE(review): the delimiter string is illegible in the listing this
    ** was recovered from; "\n" is assumed - confirm. */
    while ( -1 != UTL_SCAN_GETS( fp, "\n", &line))
    {
        recNo++ ;
        if ( recNo == startCore )
        {
            found = 1 ;
            break;
        }
    }
    if ( !found )
        goto UnableToReadCore ;

    /* The core is saved before the XRLIST value is cut out of the line. */
    (*core ) = UTL_STR_SAVE(line);
    cp = strstr(line,"XRLIST=");
    if ( !cp )
        (*xrString) = UTL_STR_SAVE("");
    else
    {
        /*
        ** Skip the keyword and the first double quote.
        */
        cp += 7 ;
        if ( *cp )
            cp++ ;
        /*
        ** Go find the end of double quotes; stop at the end of the line
        ** if the closing quote is missing.
        */
        cp1 = cp ;
        while ( (*cp) && (*cp) != '"' )
            cp++ ;
        *cp = 0 ;
        (*xrString) = UTL_STR_SAVE(cp1);
    }

    fclose(fp);
    if ( coreFile )
        UTL_MEM_FREE(coreFile);
    if ( fpFileName )
        UTL_MEM_FREE(fpFileName);
    if ( prefix )
        UTL_MEM_FREE(prefix);
    return 1 ;

UnableToReadCore :
    /* The file may be open here when the wanted record was not found. */
    if ( fp )
        fclose(fp);
    fprintf(stderr,"ReadMastersetCoreInfo() - Unable to read core %s %d\n",
            coreFile,startCore);
AddTraceback :
    fprintf(stderr,"ReadMastersetCoreInfo() - Unable to read core info\n");
    if ( coreFile )
        UTL_MEM_FREE(coreFile);
    if ( fpFileName )
        UTL_MEM_FREE(fpFileName);
    if ( prefix )
        UTL_MEM_FREE(prefix);
    return 0 ;
}

static void DeallocateBitset( struct BitSetFileStnict "^bitset ) 
{ 

int i ; . 
if ( bitset->masterFi]eInfo.masterFilePathNaine ) 
15 UTL_MEM^FREE(bitset->niasterFUeInfo.masterFilePaA^ 
if ( bitset-> masterFilelnfo.corefilePathName ) 

UTL_MEM_FREE(bitset- > masterFilelnfo.corcfilePathName); 
if ( bitset-> masterFilelnfo.fingerFileName ) 

UTL_MEM_FREE(bitset- > masterFilelnfo. fingerFileName); 
20 if ( bitset->inastcrFileInfo.prefixForFiles ) 

UTL_MEM_FREE(bitset- > masterFilelnfo.prefixForFiles); 
for ( i = 0 ; i < bitset->masterFileInfo.nufnVariationSites ; i++ ) 

UTL_MEM_FREE(bitset- > masterFileInfo*x_FiIeName[i]); 
if ( bitset- > inasterFileInfo.x_FileNaine) 
25 UTL_MEM_FREE(bitset- > masterFilelnfo, xFileName); 

if ( bitset->programInfo.programName ) 

UTL_MEM_FREE(bitset- > programlnfo.programName); 
if ( bitset- > programlnfo.buffer ) 

UTL_MEM_FREE(bitset- > programlnfo.buffer); 
30 IHBDestroy(bi tsel- > bitset) ; 

if ( bitset- >actuallSizes ) 

UTL_MEM_FREE(bitset- > actuallSizes) ; 
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if ( bitset->allocSizes ) 

UTL_MEM_.FREE(bitset- > allocSizes); 
if ( bitset->numFragsInEachSite ) 

UTL.MEM_FREE(bitsei- > nuroFragsInEachSite); 
5 UTL_MEM^FREE(bitset); 

bitset = (struct BitSetFiieStnict *) NULL ; 

void CS^PRDCT_BITSET_DUMP( struct BitSetFileStruct *bitset ) 
( 

10 inti; 
int indx ; 
int indxl ; 
int indx2 ; 

fl>rintf(stderr/Master file name : - 
15 %s\n* ,bitset- > masterFilelnfo. masterPilePathName); 

fprintf(stderr,*Master file rec : %d\n*',bitset->masterFileInfo.masterRecNo); 

iprintf(stderr/ProgFam Name : %s\n*',bitset->programInfo.programName); 

fprintf(stderr, "Number of Sites : %d\n*')bitset->numVariationSites); 

fyrin tf(stderr, "Number Selected : %d\n* , bitset- > totalSelected) ; 
20 fiprintf(stderr,"Actual Sizes : 

for ( J = p ; i < bitset- > numVariationSites ; i++ ) 
fiprintf(stderr, " %d * ,bitset- > actuallSizesp]) ; 

iprintf(stdaT,"\n*); 

fprintf(stdOT,"Alloc Sizes : 
25 for ( i — 0 ; i < bitset- >numVariatiGnSites ; i++ ) 

fprintf(stderr/%d ",bitset->allocSizes[il); 

fiprintf(stdenr,"\n*); 

fprintf(stderr,"Num Frags in X? : 

/* 

30 ** If the number of fragments is zero then we will write -1 to tell others 
** to calculate this themselves. 
*/ 

for ( i = 0 ; i < bitset- > num VariationSites ; 1+ + ) 
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fSprintfCstdeir/^d *',(bitset->numFragsInEad)Siteri] == 0 )?-l: 

bitset- > numFiagsInEachSite[i]) ; 

fiprintf(stderr,"\n"); 
fprintf(slderr/Selections : Xn*); 
S indx = -1 ; 

do 

{ 

indx = IHBFindNextOne(bitset->bitset»indx+l); 
if (indx == -1 ) 
10 break; 

indxl = indx / bitset- >allocSizes[l] ; 
indx2 = indx % bitset- >allocSizes[l] ; 
fprintf(stderr/%d %d\n", indxl 4- 1 ,indx2 + 1 ); 
} while ( 1 ); 

15 } 

void CS_PRDCT_BITSET_GET_HITS( struct BitSetPileStnict *bitset , int **indexes) 
{ 

int i ; 
int indx ; 
20 int indxl ; 
int indx2 ; 
int hitNo = 0 ; 

indx = -1 ; 

do 

25 { 

indx = IHBFindNextOne(bitset- > bitset, indx + 1 ) ; 
if (indx ==-l ) 
break; 

indxl = indx / bitset- >allocSize$[l] ; 
30 indx2 = indx % bitset- >allocSizes[l] ; 

indexes[0] [hitNo] = indxl + 1 ; 
indexestlllhitNol = indx2 + 1 ; 
hitNo+4' ; 
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} while ( 1 ); 



/*
**+E:
**
** Function Name : CS_PRDCT_BITSET_OPEN()
**
** Purpose : Function will read in the header for a CS product bitset.
**
** Usage :
**
** Returns : A handle to the product bitset info structure or NULL on
**           error.
**
** Algorithms : None.
**
** Revision History :
**
** Author              Date        Description
** ==================  ==========  ====================
** Fred Soltanshahi    07/26/96    Original version.
**
**-E:
*/

void *CS^PRDCT_BrrSET_OPEN( char *bitsetFileName , int offset ) 
{ 

struct BitSetFtleStruct *bitset ; 
30 if ( ! (bitset = Read And Allocate(bitsetFileName,offset)) ) 

return (void *)NULL ; 
bitset- >totalSeIected = IHBCountOnes(bitset-> bitset. 
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0, IHBBitSize(bitset->bitset)); 

/* 

** If die pn^ram did not fceq> track of and ou^ut this to the file then we 
need to calculate it ourselves. 

5 */ 

if( (bitset->numFragsInEachSiteIO] ==0) j| ( bitset->numFragsInEa(*Site[0] 

==-1)) 
{ 

CalculateFragsInSties(bitset); 

10 } 

retum (void '^)bitset ; 

} 

/*
**+E:
**
** Function Name : CS_PRDCT_BITSET_CLOSE()
**
** Purpose : Function will close a bitset file and cleanup allocated
**           memory.
**
** Usage :
**
** Returns : None.
**
** Algorithms : None.
**
** Revision History :
**
** Author              Date        Description
** ==================  ==========  ====================
** Fred Soltanshahi    07/26/96    Original version.
**
**-E:
*/

/*
** Releases a product bitset and everything it owns by handing it to
** DeallocateBitset().  The handle must not be used afterwards.
*/
void CS_PRDCT_BITSET_CLOSE( struct BitSetFileStruct *bitset )
{
    DeallocateBitset(bitset) ;
}

/*
**+E:
**
** Function Name : CS_PRDCT_BITSET_WRITE()
**
** Purpose : Function will write a bitset into the given file.
**
** Usage :
**
** Returns : 1 on success or 0 on failure.
**
** Algorithms : None.
**
** Revision History :
**
** Author              Date        Description
** ==================  ==========  ====================
** Fred Soltanshahi    08/02/96    Original version.
**
**-E:
*/

int CS_PRDCT_BrrSET_WRITE(char *fileName,char *programName,struct 
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BitSetFileStruct '*pnxiuctBitset,int progBufferSize,int *progBuffer) 
{ 

if ( !WriteOutConipiessedBSFile(rileName, 

productBitset- > masterFilelnfo, masterFilePathName. 

productBitset- > masterPildnfo.masterRecNo, 

programName, 

productBitset- > bitset, 

productBitset- > numVariationSiles, 

productBitset- >actuaIlSizes, 

productBitset- > allocSizes, 

productBitset- > totalSdected, 

productBitset- > numPragsInEachSite, 

progBufferSize, 

progBuffer)) 

goto AddTraceback ; 
letum 1 ; 
AddTraceback : 

fprintf(stdeiT/CS^PRDCT^BITSET^WRITEO-Unabie to write bitset fileXn"); 
return 0 ; 



20 } 



/*
**+E:
**
** Function Name : CS_PRDCT_BITSET_CREATE()
**
** Purpose : Function will create an in-memory product bitset from a
**           master file.
**
** Usage :
**
** Returns : A handle to the product bitset info structure or NULL on
**           error.
**
** Algorithms : None.
**
** Revision History :
**
** Author              Date        Description
** ==================  ==========  ====================
** Fred Soltanshahi    08/02/96    Original version.
**
**-E:
*/

/*
** Builds an in-memory product bitset from one master file record by
** delegating to ReadAndAllocateMaster().  Returns the new handle, or NULL
** when that call fails.
*/
void *CS_PRDCT_BITSET_CREATE(char *masterFileName,
                             int masterRecNumber,
                             int *initRawBitset)
{
    struct BitSetFileStruct *bitset ;

    if ( !(bitset = ReadAndAllocateMaster(masterFileName,
                                          masterRecNumber,
                                          initRawBitset)) )
        return (void *)NULL ;
    else
        return (void *)bitset ;
}



/*
**+E:
**
** Function Name : CS_PRDCT_BITSET_SETBITS()
**
** Purpose : Function will copy a raw bitset into the ChemSpace product
**           bitset format.
**
** Usage :
**
** Returns : 1 on success or zero on failure.
**
** Algorithms : None.
**
** Revision History :
**
** Author              Date        Description
** ==================  ==========  ====================
** Fred Soltanshahi    08/02/96    Original version.
**
**-E:
*/

20 int CS_PRDCT_BITSET_SETBITS(void *bs, int •rawBS, int numProducts) 

{ 

Struct BitSetFileStruct *bitset = (struct BitSetFileStnict *)bs ; 
void ^compressed ; 
static int firstTime = 1 ; 
25 int i; 
int total; 

char ^ = (char *)rawBS ; 
int rowLength ; 
int indexl ; 
30 int index2 ; 
int byte ; 
int bit ; 

int totalSelected = 0 ; 
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if ( firsfTime ) 
{ 

InitO: 

firstTime = 0 ; 

5 } 
/• 

** Just create a new one. 
*/ 

m 

if (bitset->bitset) 
10 IHBDestioy(bits^- > bitset ) ; 

if ( !(bitset->bitset = CreateCompressedBitSet(rawBS, 

bitset- > numVariationSites, 

15 

bitset- > actuallSizes, 

bitset- > allocSizes) ) ) 

goto UnableToCreateBitSet ; 
20 total = bitset- >actuallSizes[0] ; 

for ( i = 1 ; i <' bitset- > numVariationSites ; i++ ) 

total *= bitset- >actuallSizesfi] ; 
if ( numProducts == -1 ) 

I* 

25 Calculate what products are being set. 
*/ 

{ 

numProducts = 0 ; 

rowLaigth = bitset- >actuallSizes[l] ; 
30 for ( i = 0 ; i < total ; i++ ) 

{ 



byte = ( i ) / 8 ; 
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bit = ( i ) % 8 ; 
if ( q)[byte] & setbits[bit] ) 
numProducts++ ; 

} 

5 } 

bitset- > totalSelected numPioducts ; 
return 1 ; 
UnableToCreateBitSet : 

fprintf(stdOT/CS_PRIXT_BmET_SETBrrS" Unable to set bit\n"); 
10 return 0 ; 

} 



/*
**+E:
**
** Function Name : CS_PRDCT_BITSET_TO_RAW()
**
** Purpose : Function will copy a ChemSpace product bitset to a
**           raw bitset format.
**
** Usage : calloc rawBS before call. useAlloc nonzero to use allocated
**         rather than actual dimensions.
**
** Returns : 1 on success or zero on failure.
**
** Algorithms : None.
**
** Revision History :
**
** Author              Date        Description
** ==================  ==========  ====================
** David Patterson     09/09/96    Original version.
**
**-E:
*/

/*
** Flags every selected product of bs in rawBS, starting at offset 0, by
** calling CS_PRDCT_BITSET_CONCAT_RAW().  Always returns 1.
*/
int CS_PRDCT_BITSET_TO_RAW (void *bs, int *rawBS, int useAlloc)
{
    CS_PRDCT_BITSET_CONCAT_RAW(bs, rawBS, 0, useAlloc);
    return 1;
}



10 int CS_PRDCT_BITSET_CONCAT_RAW(void *bs, int •lawBS, int offset, 

int useAlloc) 

{ 

int *indxs = 0; 
int address, sum, b; 
15 struct BitSetFileStruct *bitsct = (struct BitSetFileStruct *) bs; 

for ( address = -1 , b = 0 ; b < bitset->totalSdected ; b++ ) 

{ 

address ~ IHBFindNextOne(bitset->bitset,address+l); 
BitSetAddressTolndexes(bttset,address,&indxs,0); 
20 if (useAlloc) 

FlagProduct(rawBS, 0,0, address+offset); 
dse /* must explicitly calculate the address 
{sum== CS^PRD(n'_BrrSETJNDEXES^TO_INDEX( bitset, indxs) ; 
FlagProduct(rawBS, 0,0, sum+offset); 

25 } 
} 

UTL_Mra4_FREE(indxs); 
return 1; 

} 

/*
**+E:
**
** Function Name : CS_PRDCT_BITSET_SELECTED()
**
** Purpose : Function will return a ChemSpace bitset's totalSelected.
**
** Usage :
**
** Returns : int -- count of selected bits in bitset.
**
** Algorithms : None.
**
** Revision History :
**
** Author              Date        Description
** ==================  ==========  ====================
** David Patterson     09/24/96    Original version.
**
**-E:
*/

int CS_PRDCT_BrrSET_SELECTED (void •bsvoid ) 
{ 

struct-BitSetFlleStnicl *bs = (struct BitSetFileStnict •) bsvoid; 
25 return bs- > totalSelected ; 

} 

/*
**+E:
**
** Function Name : CS_PRDCT_BITSET_REVEAL()
**
** Purpose : Function will return a ChemSpace bitset's struct info to
**           external calling program.
**
** Usage :
**
** Returns : 1 on success or zero on failure.
**
** Algorithms : None.
**
** Revision History :
**
** Author              Date        Description
** ==================  ==========  ====================
** David Patterson     09/10/96    Original version.
**
**-E:
*/

int CS_PRDCT__BrrSET^REVEAL (void *bsvoid, 
20 char **MasterFile_Bitset, 

int *StartRec_Bitset, 

int ^BitsInAbsenda, 

int *BitsInAbsentiaNoCount, 

char **CorcFile, 
25 int *StartCore, 

char **FngrFile, 

char ***Xfiles, 

int •*nY, 

FILE **FngrFile_File, 
30 int *FingeiOff. 

char **ScreenFileName, 
int *BytesPerFingerPrint, 
int *WordsPerFingerprinl, 
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int **query, 

int ♦*FingerCore_FP, 

int ♦FingerCore_Caid ) 

{ 

S int size; 
int *fooi; 

struct BitS^ileStnict *hs = (struct BitSetFileStruct *) bsvoid; 
if (MasterFiie.Bits^) 

*MasterFile_Bitset = bs- > masterFildnfo. itiasterFilePathName ; 
10 if (StartRecJ3itset) 

♦StaitRec^Bitset =bs->nruisterFileInfo.inasterRecNo ; 
if (BitsInAbsentia) 

^BitsInAbsentia = bs- > masterFilelnfo. numberOfMissingBits; 
if (BitsInAbsentiaNoCount) 
IS ^BitsInAbsentiaNoCount = bs-> masterFilelnfo. Ibits; 
if(CoreFile) 

^CoreFile = bs- > masterFilelnfo.corefilePathName; 

if (StartCore) 

*StartCore = bs-> masterFilelnfo. startCore; 

20 if (FngrFile) 

♦FngrFile = bs- > masterFilelnfo. flngerFileName; 

if (Xfiles) 

*Xfiles = bs- > masterFilelnfo. x_FileName; 

if(nY) 

25 *n Y = bs- > actuallSizes; 

if ( FngrFile^File) 

{ if (!((*FngrFile_File) = UTL^FILE^FOPEN((*FngrFile),"r-))) return 0; 

if (!UTL^FILE_FREAD(&i,sizeof(int),l.*FngrFae_File)) return 0; /* nbits 

fp*/ 

30 *BytcsPerFingerPrint = ( i + 7 ) / 8 ; 

♦WordsPerFingerprint = (*BytesPerFmgerPrint 4- 3) / 4; 
(♦query) = (int *) UTL_MEM^ALL(X:( *BytesPerFingerPriiit); 
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if (!UTL_FILE_FREAD(&i.sizeof(in0.1,*FngrFiIe_File)) return 0; /* record 

cnt */ 

if (!UTL_FILE_FREAD(&i,sizeof(int), 1 ,*FngrFile_File)) return 0; /* record 

size */ 

5 rewind(*FngrFile_File); 

if (!(fooi = Ont •) UTL_MEM_ALLOC( i ))) return 0; 

aze = (3+i)/4 ; 

for( i=0; i<= •FingerOff, i++) 

if (!UTL_FILE_FREAD( fooi.sizeofCint),size,*'FngrFile_File)) 
10 return 0; 

/* if ( fooi[l] ! = 2 + nY_01 * n Y_02 ) return 0; */ 
if ( ScreenFileName ) 
( 

if (!((*ScrBenFileName) = UTL_STR_SAVE(fooi+4))) return 0 ; 

15 } 

if ( FuigerCore_FP ) 
{ 

*FingerCore_FP = foci; 

if (!UTL_FILE_FREAD( FingerCore_Card,sizeofOnt),l, 

20 •FngrFile_Fae)) 

return 0; 

if (!UTL_FILE_FREAD(*FingerCore_FP , 

sizeof(int), 

♦WordsPerFingerprint, 

25 *FngrFile_Fi!e)) 

return 0; 

} 

} 

return 1; 

30 } 
/*
**+E:
**
** Function Name : CS_PRDCT_BITSET_INDEXES_TO_INDEX()
**
** Purpose : Function will return the right bit given a set of indices.
**
** Usage : all indexes are 0 based.
**
** Returns : index to use in bitset.
**
** Algorithms : None.
**
** Revision History : extracted from CS_PRDCT_BITSET_SET_PRD_BIT by
**                    David Patterson
**
** Author              Date        Description
** ==================  ==========  ====================
** Fred Soltanshahi    08/02/96    Original version.
**
**-E:
*/

int CS_PRDCT_BrrSET_INDEXES_TO_INDEX( stnict BitSetFileStnict "bitset, 
25 int '^indexes) 

{ 

int i ; 
int j ; 

int rowLength[MAX_VARIATION JITESl ; 
30 int indx - 0 ; 

for ( i = 0 ; i < bitset- > numVariationSites ; i++ ) 

{ 

rowLength[i] = 1 ; 
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for ( j = i + 1 ; j < bitset- > numVariationSites ; j++ ) 
rowLength[i] bitset- >actuallSizes[j] ; 

} 

for ( i = 0 ; i < bitset- > numVariationSites ; i++ ) 
5 { 

indx += indexes[i] * rowLength[i] ; 

} 

return indx ; 

} 

/*
** Function Name : CS_PRDCT_BITSET_ALLOC_SIZE_INDEXES_TO_INDEX()
**
** Purpose : Function will return the right bit given a set of indices;
**           it uses the allocated sizes in the bitset to get the info.
*/

20 int CS_PRDCT_BITSET_ALLOC_SIZE_INDEXES_TO_INDEX( struct BitSetFileStnict 
*bitset, 

inl *indexes) 

{ 

int i ; 
25 int j ; 

int rowLength[MAX_VARIATION_SITES] ; 
int indx = 0 ; 

for ( i = 0 ; i < bitset- > numVariationSites ; i-h+ ) 
{ 

30 rowLengthfi] = 1 ; 

for(j = i + 1 ; j < bitset- > numVariationSites ; j + + ) 
rowLengthfi] bitset- >allocSizes[jl ; 
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} 

for ( i = 0 ; i < bitset->nuinVariationSites ; i++ ) 
{ 

indx + = indexes[i] * rowLengthfi] ; 

5 } 

return indx ; 

} 

/*
**+E:
**
** Function Name : CS_PRDCT_BITSET_SET_PRD_BIT()
**
** Purpose : Function will set a product bit with the given indexes.
**
** Usage :
**
** Returns : none.
**
** Algorithms : None.
**
** Revision History :
**
** Author              Date        Description
** ==================  ==========  ====================
** Fred Soltanshahi    08/02/96    Original version.
**
**-E:
*/

int CS_PRDCT_BrrSET_SEr_PRD_Brr(void *bs, int ♦indexes) 
{ 
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Struct BitSetFileStnict *bitset = (struct BitSetFileStruct *)bs ; 
int indx = 0 ; 

indx == CS_PRDCT_BITSET_.ALlXH:^SIZEJ^roEXES_TOJNDEX0^^ 
indexes); 

5 IHBSet(bitset->bitset, indx ); 

bitset->totalSdected++ ; 
return 1 ; 

} 

/*
**+E:
**
** Function Name : CS_PRDCT_BITSET_GET_RINFO()
**
** Purpose : Function will return the Reaction/Reagent info from
**           the bitset file.
**
** Usage :
**
** Returns : none.
**
** Algorithms : None.
**
** Revision History :
**
** Author              Date        Description
** ==================  ==========  ====================
** Fred Soltanshahi    01/03/97    Original version.
**
**-E:
*/
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int CS_PRDCT_^BrrSET_GET_RINFO(void •bs, char **reactionInfo,char ***ieagentIiifo) 
{ 

struct BitSetFUeStnict *bitset = (struct BitSetFileStnict *)bs ; 
^reactionlnfo = bitset->inasterFileInfo.prefixForiPiles ; 
"^reagentlnfo = bitset->ma^FiIeInfo.reagenanfo ; 
return 1 ; 

} 



10 



15 



/*
**+E:
**
** Function Name : CS_PRDCT_BITSET_GET_STATS()
**
** Purpose : Function will return the statistics for a bitset file,
**           these will include numberOfSites, originalSizes,
**           numberOfProducts and Number of fragments used at each
**           variation site.
**
** Usage :
**
** Returns : none.
**
** Algorithms : None.
**
** Revision History :
**
** Author              Date        Description
** ==================  ==========  ====================
** Fred Soltanshahi    08/05/96    Original version.
**
**-E:
*/
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int CS_PRDCT_BITSET_GET_STATS(void *bs, int '^umSites. int *nuinProducts, 
int **si2es, int **nuinUsed ) 

{ 

5 struct BitSetFileStnict *bitset == (struct BitSetFileStnict *)bs ; 
'^umSites = bitset->numVariationSites ; 
^umProducts = bitset->totalSel6Cted ; 

/* 

** Allocate buffers, if they have not been. 
10 •/ 

if ( !(*sizes) ) 
{ 

if ( !((*sizes) = (int *)UTL_MEM_CALLOC(*numSites,sizeof(int)))) 
goto UnableToAIlocateMemory ; 

15 } 

if(!(*numUsed)) 

{ 

if ( !((*numUsed) = Gnt *)UTL_MEM_CALLOC(*numSites,si2eof(int)))) 
goto UnableToAIlocateMemory ; 

20 } 

memq)y(*sizes, bitset->actuallSizes, sizeof(int) * *numSites ); 
memcpy(*numUsed, bitset->numFragsInEachSite, sizeofOnt) * *numSites ); 
return 1 ; 
UnableToAIlocateMemory : 
25 Iprintf(stderr/CS_PRDCT_B1TSET GET_STATS() - Unable to allocate 

memory\n"); 

return 0 ; 

} 



/*
**+E:
**
** Function Name : CS_PRDCT_BITSET_CORE_INFO()
**
** Purpose : Function will get the xfile and core and xrstring info
**           from the bitset file.
**
** Usage :
**
** Returns : 1 on success or 0 on error.
**
** Algorithms : None.
**
** Revision History :
**
** Author              Date        Description
** ==================  ==========  ====================
** Fred Soltanshahi    08/09/96    Original version.
**
**-E:
*/

/*
** Public wrapper: passes every argument straight to ReadBitsetCoreInfo()
** and returns its result.
*/
int CS_PRDCT_BITSET_CORE_INFO(void *bs, char **masterName, int *masterRecno,
                              char **core, char **xrString, int *numSites,
                              char ***xFileNames )
{
    return ( ReadBitsetCoreInfo(bs,masterName,masterRecno,
                                core,xrString,numSites,xFileNames));
}

/*
**+E:
**
** Function Name : CS_PRDCT_BITSET_PROG_NAME()
**
** Purpose : Function will get the program name that produced this bitset.
**
** Usage :
**
** Returns : 1 on success or 0 on error.
**
** Algorithms : None.
**
** Revision History :
**
** Author              Date        Description
** ==================  ==========  ====================
** Fred Soltanshahi    08/09/96    Original version.
**
**-E:
*/

int CS_PRDCT_BrrSET_PROG_NAME(void *bs, char **programName) 
20 { 

*programName = ((struct BitSetFileStruct *)bs)->programInfo.prpgramName ; 
return 1 ; 

} 



/*
**+E:
**
** Function Name : CS_PRDCT_MSTR_CORE_INFO()
**
** Purpose : Function will get the xfile and core and xrstring info
**           from the master file.
**
** Usage :
**
** Returns : 1 on success or 0 on error.
**
** Algorithms : None.
**
** Revision History :
**
** Author              Date        Description
** ==================  ==========  ====================
** Fred Soltanshahi    08/09/96    Original version.
**
**-E:
*/

int CS__PRDCT_MSTR_CORE_INFO(char *inasterFile. int index, diar **core, char 
**xrString, int *numSites, char ***xFileNames ) 

{ 

return ( ReadMasterCoreInfo(masterFileJndex 
20 »core,xrString,numSites,xFileNames)); 

} 



/*
**+E:
**
** Function Name : CS_PRDCT_BITSET_CREATE_BIT_STRING()
**
** Purpose       : Function will create a compressed version of a raw bit
**                 set. It returns the memory size needed to hold the bitset
**                 through *totalSize.
**
** Usage         : The first five arguments are forwarded unchanged to
**                 CreateCompressedBitSet().
**
** Returns       : pointer to a compressed bitset (this is not a ChemSpace
**                 product bitset but just a compressed bitstring), or NULL
**                 on failure. *totalSize is written only on success.
**
** Algorithms    : None.
**
** Revision History :
**
** Author              Date       Description
** ------------------  --------   -----------------
** Fred Soltanshahi    08/06/96   Original version.
**
**-E:
*/
void *CS_PRDCT_BITSET_CREATE_BIT_STRING( int *rawBits, int offset,
                                         int numVariations, int *sizes,
                                         int *allocSizes, int *totalSize)
{
    void *compressed ;

    compressed = CreateCompressedBitSet(rawBits,
                                        offset,
                                        numVariations,
                                        sizes,
                                        allocSizes) ;
    if ( !compressed )
        goto UnableToCreateBitSet ;

    *totalSize = IHBRealSize(compressed);
    return compressed ;

UnableToCreateBitSet :
    fprintf(stderr,
            "CS_PRDCT_BITSET_CREATE_BIT_STRING() - Unable to create bitset\n");
    return ( void *)NULL ;
}
/*
**+E:
**
** Function Name : CS_PRDCT_BITSET_DESTROY_BIT_STRING()
**
** Purpose       : Function will destroy the memory for a bitstring
**                 allocated by the CREATE call above.
**
** Usage         : bitset is the pointer returned by
**                 CS_PRDCT_BITSET_CREATE_BIT_STRING().
**
** Returns       : none
**
** Algorithms    : None.
**
** Revision History :
**
** Author              Date       Description
** ------------------  --------   -----------------
** Fred Soltanshahi    08/06/96   Original version.
**
**-E:
*/
void CS_PRDCT_BITSET_DESTROY_BIT_STRING( void *bitset)
{
    /* NOTE(review): callee name restored from a garbled scan
       ("IHBPestroy"); confirm it is IHBDestroy. */
    IHBDestroy(bitset);
}



30 /♦ 
*♦+£: 
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** 
mm 

Function Name : CS_PRDCT_BrrSET_GErHITSO 

mm 

S Purpose : Function will return the indexes^nto the original XI, X2 files 

for the requested number of hits. 

mm 

♦* Usage : 
** 

10 ** Returns : Number of hits found or -1 for error. 

mm 

** Algorithms : None. 

mm 

Revision History : . 

15 

** Author Date I>escription 



** Fred Soltanshahi 08/07/96 Original version. 
20 ** 

•/ 

int CS^PRDCT_BITSET_GETH1TS( void *bs, int offset, int numberOfHits, inl 
***hitlndexes) 
25 { 

struct BitSetFileStruct *bitset - (struct BitSetFileStruct *)bs ; 
int numFound ; 
int numConnections ; 
static int *bitAddresses == (int *)NULL ; 
30 static int numBitAddresses = 0 ; 
int *indxs = (int *)NULL ; 
int start; 
int count ; 
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int 1 ; 
intj ; 

(*hiandexes) = (int **)NULL ; 
numConnecdons = bitset->numVarialionSites ; 

5 /* 

Local housekeq>ing . 

♦/ 

if ( numb^fHits > numBitAddresses ) 
{ 

10 if ( IbitAddresses ) 

{ 

if ( !(bitAddresses = fint *)UTL_MEM_CALL0C(numberOfHits, 

^zeof(int))) ) 

goto UnableToAIiocate ; 

15 } 

else 

{ 

■ 

if ( !(bitAddresses = (int *)UTL_MEM-REALLOC(bitAddrcsscs, 

20 numb^fHits* sizeof(int))) ) 

goto UnableToAIiocate ; 

} 

numBitAddresses = numberOfHits ; 

} 

25 /* 

** Figure out if we have the number of hits he wanted and what their addresses 
** are in the bitset file. 

mm 

** We will have to come back and speed this up if it is to slow, but for now 
30 ** lets get it working. 
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/* slart = bitset->firstHitAddress ; */ 

start = -!;/* start from the boning */ 

for ( count = 0 ; dbunt < offset ; count+ + ) 

5 { 

start = IHBFindNextOne(bitset->bitset,start+l); 

I* 

** Lets remember where the first hit is, this should save us some time later. 

10 if ( bitset- > firstHitAddress < = 0 ) 

bitset->firstHitAddtess = start ; 
if ( start ==-1) 
{ 

return 0 ; 

15 } 
} 

/* 

Now lets see how many bits are set from here on. 

*/ 

20 for ( numFound = 0 ; numFound < numberOfHits ; numFound++ ) 

{ 

start= IHBFindNextOne(bitset- > bitset,start4- 1); 
if (start == -1 ) 
break; 

25 bitA{ldresses[numFound] = start ; 

} 

** Allocate the arrays. 
*/ 

30 if ( !(*hitlndexes = (int **)in'L_MEM_CALLOC(nuinConnections,sizeof(int ♦))) ) 

goto UnableToAIlocate ; 
for ( i = 0 ; i < numConnections ; i + + ) 



I 
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if ( !((*hitlndexes)ri] = (int *)UTL_MEM_CALLOC(nuinFound, 



S]zeof(int ))) ) 

goto UnableToAllocate ; 

5 } 
/* 

** Now translate each one of the bitset addresses to the variation-site 
indexes. 

*/ 

10 for ( i = 0 ; i < numFound ; i++ ) 

{ 

BitSetAddressToIndexes(bitset,bitAddresses[i] ,&indxs,0); 
for ( j = 0 ; j < numConnections ; j++ ) 

(*hitIndexes)Dl[i] = indxs[j] + 1 ; /* Translate to 1 based indexes */ 

15 } 

if ( indxs ) 

UTL_MEM_FREE( indxs ); 
return numFound ; 
UnableToAllocate : 
20 AddTraceback : 
if ( indxs ) 

UTL_MEM_FREE( indxs ); 
return -1 ; 

} 

25 /* 
** 

mm 

** Function Name : CS_PRDCT_BnrSET_GET PARTIAL^HITSO 
30 ** 

*♦ Purpose : Function will return the indexesfinto the original XI, X2 files 



mm 



for the requested number of hits. 
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I 

** Usage : 

Returns : Number of hits found or -1 for error. 

5 *♦ 

** Algorithms : None. 



** Revision History : 



10 ** Author Date Description 

— = = — — — — 



mm 



Fred Soltanshahi 08/07/96 Original version. 
•* 

15 **-E: 
♦/ 

int CS_PRDCT_BITSET_GET_PARTIAL_HrrS{ void *bs, int *numProducts, int site, int 
numl^xedSites, int ^fixedSitesIndexes, int *numFragmentsPerSite, int '^'*'httlndexes ) 

{ 

20 struct BitSetPileStruct *bitset = (struct BitSetFileStruct *)bs ; 
int total ; 

* 

(♦hitlndexes) = (int *)NULL ; 
(3etPartiaIProductsStats( bitset , 

numPixedSites, 

25 fixedSitesIndexes, 

&total, 

numPragmentsPerSite); 

(♦numProducts ) = GetPartialProductsAddresses(bitset, 

numFixedSites, 

30 fixedSitesIndexes, 

site, 

hitlndexes); 
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return 1 ; 

} 



5 ** 

** Function Name : (X_PRIXT_Bn^ET_GET_PRDCT_PARTIAL_HITSO 



Purpose : Function will return the indexesOnto the original XI, X2 files 

10 ♦* for the requested number of hits, 

*♦ 

** Usage : This works when the csln is actually being exploded. 
** 

Returns : Number of hits found or -1 for error. ' 

15 ** 

** Algorithms : None. 

** Revision History : 

20 ** Author Date Description 

m* : 



** Fred Soltanshahi 08/07/96 Original version. 



25 **-E; 
♦/ 

int CS_PRDCT_BITSEr_GET^PRDCT_PARTIAL^HITS( void '«'bs, int *numProducts, int 
site, int numFixedSites, int ^fixedSitesIndexes, int *numFragmentsPerSite, int ^^hitlndexes 

) 

30 { 

struct BitSetFileStnict ^bitset = (struct BitSetFileStruct *)bs ; 
int total ; 
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(♦hitfndexes) = (int *)hfULL ; 
GetPartialProductsStats( bitset , 

numFixedSites, 
RxedSitesIndexes, 
S &total, 

numFragm^tsPerSite); 

(*numProducts ) = GetPartialPioductsAddiesses(bitset, 

nunriFixedSites, 
fixedSitesIndexes, 
10 site, 

hitlndexes); 

(♦numProducts ) = total ; 
return 1 ; 

} 

15 /* +E 

Abstract: For Chemspace bitset file call callback with products choices not selected. 
Input: 

L This function takes a BitSetFileStmct returned most likely from: 

CS_PRDCT_BITSETlOPEN(char ^filename) 
20 2. A void pointer which is passed to callback function. This is for 
whatever you want. 
3. A pointer to function returning: 

int (void "^idata, int numVariants, int ^choices ). 
choices is of size num Variants, the choices are zero based, and 
25 choices[0] is the choice for markush Y_01, choice[l] for Y_02 etc. 

NOTE 1: num Variants of -1 and a null for choices is passed to signify 
the end of the choices excluded, just in case the function 
want to do some special processing at the end. 
NOTE 2; The return value from the callback function is ignored. 
30 Returns: 

Total number of bits excluded. 
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-1 Upon error, 

** Author Date Description 

______ — „ 



5 **RobJilek 07/26/96 Original version. 

*/ 

int CS_PRDCT_BrrSET_ZERO(struct BitSetFUeStruct ♦bitset. void *udata, 
int (*ZeroProducts)(void *udata, int numVariants, int '^:hoices ) ) 

{ 

10 BIT^TRACKING bt[l]; 

if ( bitset- > numVariationSites < = 0 ) 
return -1; 

bt- > num Variations = bitset-> numVariationSites; 
bt->bitset = bitset; 
15 bt- > call_udata = udata; 

bt- > funq)tr = ZeroProducts; 

bt-> choices = Ont *) UTLMEM_CALLOC(bt-> num Variations, sizeof(int) ); 
bt- > totalExcIuded 0; 

/* The sequence is as follows: 
20 IHBRange has a loqi to find zeros/ones. 

* 

It calls RangeCallback 

RangeCallback calls ZeroProducts callback, 
while ( not end of list ) 

call RangeCallback with start and end Range. 
25 for ( i = startRange; i < = EndRange; 1+4- ) // 

RangeCallback 

calculate product array. 

call ZeroProducts // 

ZeroCallback 
30 */ 

IHBRange(bitset-> bitset. 0. (void *) bt, RangeCallback ); 
UTL_MEM_FREE((char *) bt-> choices ); 
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letum bt->totalExcIuded; 



} 

/*+! 



Synopsis: Gets called for each range of bits set. It th^ 
5 converts each bit to a product array and calls callback for each. 
*/ 

static int RangeCallback (-void *udata, int startKange, int ^dRange ) 
{ 

Brr^TRACKING *bt = (BIT^TRACKING *) udata; 
10 int indx; 

int oor; 
int skip; 

void *call_udata; 
int numVan 
15 int ^choices; 



for ( indx = startRange; indx < = endRange; ) 
{ 

skip = BitSetAddressToIndexes(bitset,indx,&choices,&oor); 
if ( !oor ) 



20 



void *bitset; 

call_udata = bt->call_udata; 
numVar = bt->num Variations; 
choices - bt-> choices; 
bitset = bt->bitset; 



25 



{ 



(♦bt->funcptr)(call_udata, numVar, choices ); 

bt- > totalExcluded + + ; 

indx++; 



30 



else 



{ 



if (skip > 0) 

indx += skip; 



I 
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dse 

indx++; 

} 

} 

5 (*bt->funq)tr)(call_udata,-l, (int *) 0 ); /♦ Signify end of zeros. */ 

return 0; 

} 

/• +E 

Abstract: For Chemspace bitset file call callback with products choices selected. 
10 Input: 

1. This function takes a BitSetFileStnict returned most likely from: 

CS_PRDCT_BITSET_OPEN(char *filename) 

2. A void pointer which is passed to callback function. This is for 

whatever you want. 
IS 3. A pointer to function returning: 

int (void *udata, int numVariants, int *choices ). 
choices is of size numVariants, the choices are zero based, and 
choices[0] is the choice for markush Y_01, choice[l] for Y_02 etc. 
NOTE 1: numVariants of -1 and a null for choices is passed to signify 
20 the end of the choices excluded, just in case the function 

want to do some special processing at the end. 
NOTE 2: The return value from the callback function is ignored. 
Returns: 

Total number of bits included. 
25 -1 upon error. 

See Also: CS_PRDCT_BITSET_ZERO 

** Author Date Description 



30 **RobJilek 08/19/96 Original version. 

*/ 

int CS_PRDCT_BITSET_OhfE(struct BitSetFileStnict ♦bitset, void *udata. 
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int (*OneProducts)(void *udata, int numVariants, int *choices ) ) 
{ 

BIT_TRACKING bt[l]; 

if ( bitset->numVariationSites < = 0 ) 
5 return -1; 

bt->nucn Variations = bitset->numVariationSites; 

bt->bitset = bitset; 

bt->call_udata = udata; 

bt->funq^ = OneProducts; 
10 bt-> choices = (int *) UTL_MEM_CALLOC(bt->num Variations, sizeof(int) ); 

bt- > totalExcluded = 0; 

IHBRange(bitset- > bitset, 1, (void ^) bt, RangeCallback ); 
UTL^MEM_FREE((char ♦) bt-> choices ); 
return bt-> totalExcluded; 

15 } 

/*
** Stand-alone test driver.
**
** With "#if 1" it opens the bitset file named by argv[1], dumps it and
** closes it. The "#else" branch is a disabled scenario that creates a
** bitset from a master file, sets a few product bits and writes the
** intermediate states to Test.bs / Test2.bs / Test1.bs.
**
** Returns 0 on success, 1 if the bitset could not be opened/created or no
** file name was given.
*/
int main(int argc, char *argv[])
{
    void *h ;
    /* NOTE(review): path restored from a garbled scan (closing quote was
       missing, "dbcsln des" may have been "dbcsln_des"); only used by the
       disabled #else branch. */
    char *masterFileName =
        "/home7/fred/work/ADS/dserv/source/dbcsln des/TestData/Di_300" ;
    int   masterRecNumber = 1 ;
    int  *bitset ;
    int   size = (300*400 + 7)/8 ;
    int   indexes[2] ;

#if 1
    if ( argc < 2 )
    {
        fprintf(stderr, "Unable to open the bitset file %s\n", "(none given)");
        return 1 ;
    }
    if ( !(h = CS_PRDCT_BITSET_OPEN(argv[1],0)) )
    {
        fprintf(stderr, "Unable to open the bitset file %s\n",argv[1]);
        /* The original "exit ;" was an expression statement with no call,
           so execution continued with a NULL handle. */
        return 1 ;
    }
    CS_PRDCT_BITSET_DUMP(h);
    CS_PRDCT_BITSET_CLOSE(h);
#else
    if ( !(h = CS_PRDCT_BITSET_CREATE(masterFileName,masterRecNumber,NULL) ) )
    {
        fprintf(stderr, "Unable to create bitset for %s\n",masterFileName);
        return 1 ;
    }
    CS_PRDCT_BITSET_WRITE("Test.bs","MyProg",h,0,NULL);

    indexes[0] = 59 ;
    indexes[1] = 129 ;
    CS_PRDCT_BITSET_SET_PRD_BIT(h,indexes);

    indexes[0] = 159 ;
    indexes[1] = 241 ;
    CS_PRDCT_BITSET_SET_PRD_BIT(h,indexes);
    CS_PRDCT_BITSET_WRITE("Test2.bs","MyProg",h,0,NULL);

    bitset = (int *)UTL_MEM_CALLOC(size,sizeof(int));
    bitset[5] = 49 ;
    bitset[1] = 99 ;
    CS_PRDCT_BITSET_SETBITS(h,bitset,-1);
    CS_PRDCT_BITSET_WRITE("Test1.bs","MyProg",h,0,NULL);
    CS_PRDCT_BITSET_CLOSE(h) ;
#endif
    (void) masterFileName ; (void) masterRecNumber ;
    (void) bitset ; (void) size ; (void) indexes ;
    return 0 ;
}

30 jj^endif 
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A ppCTdix "S" 

*/ 

/* topam */ 

/*+c
 * This program determines which csln "products" are similar to an input
 * structure, where similarity occurs if the sum of differences in encoded
 * "CoMFA" fields is less than some threshold.
 *
 * The csln components are referenced in a master file with
 * one multiline record per cSLN. Record format is
 *      Reaction class xxxx  (where "Reaction class" is a literal)
 *      reaction_name
 *      number_of_sv_sites
 *      missing_bits_count
 *      hashed_only_missing_bits_count
 *      core_filename
 *      core_filename_index_of_core
 *      fingerprint_filename
 *      offset_into_fingerprint_file
 *      first_sv_file_X1
 *      second_sv_file_X2  (etc if more than two sv_sites)
 * NOTE -- ALL subsequent entries in the master file whose Reaction class
 * matches the Reaction class of the record referenced by -index are also
 * processed! ("Matching" implies matching of possible other input symbols
 * to components of the Reaction class line.)
 *
 * The input structure is read as encoded fields from stdin (or
 * a named file if provided), one field per line. There
 * must be provided (by a SYBYL SPL script), in order:
 *   "number_of_sv_sites" * "number_of_field_types" fields describing the
 *        "core" of the query
 *   "number_of_sv_sites" - 1 sextets of relative coordinates of core
 *        attachment atoms
 *   "number_of_sv_sites" * "number_of_field_types" fields describing the
 *        "side chains"
 *
 * Options:
 *   -master name      - name is the file with master file records
 *   -bitset name      - name is a result of an earlier search operation
 *                       (use EITHER master or bitset)
 *   -index number     - which sequential record in master file to begin at
 *                       OR offset into bitset in a bitset file
 *                       (default = 1)
 *   -reaction name    - records in master file to be processed must have
 *                       this class name
 *   -details name     - if provided, records in master file to be processed
 *                       must have any one of these tokens following its
 *                       class name
 *   -distance tan     - tan is the overall similarity threshold
 *                       (default is 90.0)
 *   -cooweight cwt    - weight of the core attachment coordinates,
 *                       relative to fields
 *   -nocore nocore    - do not consider core topomer differences
 *                       By default these are considered (required)
 *   -allcores allc    - process all cores in the core file
 *                       By default only one core (index in the master file)
 *                       is processed
 *   -maxhits max      - stop when max hits are found (default infinity)
 *   -input filename   - name of file with queries (default stdin)
 *   -output filename  - specifies the output file for the hit info
 *   (flag name illegible in the scanned listing)
 *                     - This flag forces the display of all options
 */

15 fS^include <stdio.h> 

Anclude <signal.h> 

iftnclude <ctype.h > 

^include <unistd.h> 

^include <string.h> 
20 ^include <sys/stat.h> 

#include <math.h> 

#include "parsec^t.h** 

include 'ufl^str.h" 

#include "utl__mem.h* 
25 ^include "uU_file.h" 

^include "utl^math.h" 

#include "cLh" 

#include "ct_expr.h'' 

iKnclude "ctjiroto-h" 
30 ^define GoodExit 0 

^define ErrorExit 1 

#define Visua](s) { fjprintf s; } 



10 
/* Option names displaced from the program header comment by the page
 * layout of the scanned listing:
 *   -maxhits max
 *   -input filename
 *   -output filename
 */
/*
 * File-scope state shared by the topsim routines.
 *
 * NOTE(review): in the scanned listing the type column and the declarator
 * column were separated; they are re-paired here in order (the counts of
 * both columns match). Confirm against a clean copy.
 */
static FILE    *OutputFile = 0;
static char    *OutputFileBase;
static char     OutputFileName[200];
static int      nOutFiles = 0;          /* number of output files */
static char    *MasterFile = 0;
static char    *BitsetFile = 0;
void           *bitset;
static int      MasterRecord = 1;
static FILE    *MasterFile_File;
static int      StartCore;
static char    *InputSource = 0;
static FILE    *InputSourceFile;
static char    *ReactionNeeded = 0;
static char    *ScratchDetails = 0;
static int      nDetail = 0;
static char   **ReactionDetails = 0;
static char    *XWeights = 0;
static double  *RWeights = 0;
static double   CoreWeight;
static char    *FieldTypes = 0;
static int      nFType = 0;
static char   **FTypes = 0;
static double  *FWeights = 0;
static char   **FOrder = 0;             /* temp, for recording L->R order of data
                                           in side chain SLN */
static char   **FROrder = 0;
static char    *Corefile = 0;
static FILE    *CoreFile_File;
static char    *CoreNow;
static char   **Xfile;
static char   **Xname;
static double   Distance = 90.0;
static double   CoreDistance = 0.0;
static double   DWeight = 1.0;
static double   Dist[16][16];
static double   boundary[16];
static double   CXcoords[6], CXdiffsq[6];
static double   searched = 0.0;         /* number searched */
static double   combi = 1.0;            /* number of side chain combos */
static int      totnout = 0, nout = 0;  /* number of products. */
static int     *Good_Products = 0;      /* product bit set */
static int     *Dead_Products = 0;      /* forbidden product bit set */
static int      nR;                     /* number of R positions (usually 2) */
static int     *nX;                     /* number of product dimensions */
static int     *Xct;                    /* used for indexing over all products */
static int    **Xsize;                  /* bytes per field */
static unsigned char ****X = 0;         /* csln field (F x R x nX ) */
static unsigned char ***Xin;            /* target fields */
static unsigned char ***Y;              /* csln core fields (F x R ) */
static unsigned char ***Yin;            /* target core fields */
static double ***X2Y;                   /* distances between X and X' */
static int      nSym,                   /* number of symmetries in this core */
               *CoreSyms,               /* flags for all matching core symmetries */
              **SymList;                /* symmetry mappings */
static int      DefaultSym[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
static int      ReverseSym[2] = {1, 0};
static int      AppendToOutputFile = 0;
static int      NoMorehitsPlease = 0;
static int      UserAborted;
static int      NoCore = 0;
static int      AllCores = 0;
static int      CoreOK = 0;
static int      CoreIsSame = 0;
static int      SideChainOnly = 0;
static int      SideChainsAreSame = 0;
static int      NotBitOutput = 0;
static char     comline[2048];
static struct ParseOptions OptionsQ = { 

{"mastcr% ParseOptString, &MasterFile, 

"Prefix for all input files" }, 
{"bitset", ParseOptString, &BitselFile, 

"Name is the file with bitset records" }, 
'distance", ParseOptDouble, &Distanoe, 
"Field similarity threshold (defouU 90.0)" }, 
'cooweight" , ParseOptDouble, ADWeight, 
10 "Core coord wt, relative to fields (default )" 

index" « ParseOptInt, &MasterRecord, 

"Which MasterRecord entry 1-n" }» 
'maxhits", ParseOptlnt, &NoMorehitsPlease, 
"Maximum number of hits before stopping" } , 
15 ("nocore", ParseOptlnt, feNoCore, 

"Use -nocore to override inclusion of the core differences" }, 
'ailcores" , ParseOpUnt, &AllCores, 

"Use -allcores to search all cores provided" }, 
'input", ParseOptString, &InputSource, 
20 "File from which queries will be read( default stdin). "}, 

output", ParseOptString, ftOutputFileBase. 

"File to which hit info will be written. "}, 
" notbits" , Pars^tlnt, &NotBitOutput, 

* 

"Use notbits to output as index ASCII instead of std bitset." }, 
25 ("reaction", ParseOptString, &ReactionNeeded, 

"Reaction class for topomer search. "}, 
details", ParseOptString, AScratchDetails, 

"Details further discriminating the reaction class. **}, 
'sidechain" , ParseOptlnt, ASideChainOnly , 
30 "Use sidechain to search for similiarity in a single sidechain only. "}, 

fieldtypes", ParseOptString. AFieldTypes. 

"Names of all field types (optional prefix =weight), space separated. Does 
CTOPS if none provided,"}, 
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{"xweights", ParseOptString, &XWdghts, 

"Weights of varying sites. Must be nR(+core?) individual wdghts present (if 

any)."}, 

}; 

/* Link-time stubs ("just for compiling OK"): both do nothing and return 0.
   NOTE(review): identifiers restored from a garbled scan
   ("UBS.OUTPUT^MESSAGEO", "UIMS2_WRITE_PHOrrO0"); confirm the exact symbol
   names the libraries expect. */
int UBS_OUTPUT_MESSAGE() { return 0; }
int UIMS2_WRITE_PHOTO() { return 0; }

/* Converts the NUL-terminated string s to lower case in place.
   Always returns 0 (the original fell off the end of an int function). */
int lowercase(char *s)
{
    for ( ; *s; s++) {
        /* <ctype.h> functions require a value representable as
           unsigned char; a plain (possibly negative) char is not. */
        unsigned char c = (unsigned char) *s;
        if (isupper(c))
            *s = (char) tolower(c);
    }
    return 0;
}
static int ParseArguments( argc» argv ) 

10 * 

* This function parses the command line arguments. 

* Returns: 1 on a successful command line parse, 0 otherwise. 
15 * Warnings: 

* Errors: 

* Author Date Description 

* G. B. Smith 02-09-93 Original Version 

int argc; 
25 char **argv; 

{ 

int nargs, 

noptions = sizeof( Options )/sizeof(Options[0]); 
nargs = UTL_PARSE_OPT( argc, argv, noptions, Options ); 
30 if( ! nargs ) goto SyntaxError; 

return 1; 
SyntaxError: 

fprintf( stderr, "Bad command line argument(s)\n" ); 
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return 0; 

} 

static int OpenOutputFUeQ 
5 * 

* Returns: I on sucesss, else 0 

♦/ 

{ 

10 char *insg; 

FILE *fp; 
Ou^utFile = stdout; 
if( OutputPileBase) 
{ 

15 MakeOutputPileNameO; 
/* 

We need to create output files under the ownership of the REAL user not the 
*♦ EFFECTIVE user. This only ^plies if setuid options are activated. 
*/ 

20 { 

struct Stat statBuff ; 
int uid ; 
int euid ; 

uid = getuidO ; 
25 euid geteuidQ; 

stat(Ou^utFileName, &statBufO; 

/♦ 

** There are two cases 
** (1) the file to ou^ut to exists 
30 ** Use the ownership of the current owner of the file or if you cant do that 
** do not do anything. 
** (2) The file is being created. 
** use the owner^ip of the REAL user. 
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•/ 

if ( access{OutputFileName. F^OK) = = 0 ) 
{ /* If the file exist and the real user is the owner of the file */ 
if ( statBuff.st_uid uid ) 
5 seteuid(uid); 

} 

else 

{ /* Create the file as the REAL user */ 
seteuid(uid); 

10 } 

} 

OutpulFile = fopen( OutputFileName, (AppendToOutputFile?"a":-wb")); 
if( lOutputFile ) { 

fprintfCstderr/Bror: Failed to open output file \''%s\"\n% 
15 Ouq)utFileName ); 

goto ErrorRetum; 

} 

} 

return 1; 
20 ErrorRetum: 
return 0; 

} 

static int WhatsTheDifferenceQ 

/* builds distance lookup table and initializes default symmetry data structure */ 
25 { 

• • • 

int 1, j; 

^define pow2(a) ( (a) * (a) ) 

f* the assignment of codes is based on the following (from gen_plsx): 
static fptcutoff[ 16] = {9999., 0.. 2., 4., 6,, 8., 10., 12., 
30 14.. 16., 18.. 20., 22.. 24.. 26., 30. }; 

*/ 

boundary [0] = 9999.; /* missing data ought never to occur. */ 
boundary[l] = -0.1 ; 
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for(i=2;i< 15;i++) 
boundaiyCi] = 2*i-3; 

boundary[lS] = 30.0; /* this is a steep curve with a cutoff at 30! */ 

fdr(i=0;i<16;i++) for O=0a<16a++) 
5 Dist[i]0] = pow2( boundary[i] - boundary^]); 

Distance *= Distance; /* want to test D*2 directly */ 

DWdght DWdght; 
allocate once for all conceivable symmetry leorderings 

if (!(SymUst = {int **) UTL_MEM_.ALLOC( si2eof( int *) * nR * (nR - 1) / 2) )) 
10 return 0; 

if (!(CoreSyms = Ont *) UTL_MEM_ALLXX:( sizeof( int ) * nR * (nR - 1) / 2) )) 
return 0; 

SymUst[ 0 ] = DefaultSym; 

Symlistf 1 ] = ReverseSym; 
IS return 1; 

} 

/*
 * Converts a field from external (ASCII hex) format to internal bytes.
 *
 *   hex    - NUL-terminated string of hex digit pairs.
 *   index  - in/out expected byte count: if *index is 0 it is set to the
 *            length of this field, otherwise the field must match it.
 *   pXP    - receives a newly allocated byte array (caller owns it), or
 *            NULL when the field length does not match *index.
 *
 * Returns: 1 on success AND on a length mismatch ("bad field, continue
 *          anyway"); 0 only if the allocation fails.
 */
static int ReadAField(char *hex, int *index, unsigned char **pXP)
{
    int          nbytes, k;
    unsigned int value;
    char         pair[3];

    nbytes = (int) (strlen( hex ) / 2);     /* assuming 8-bit bytes */
    if ( !*index )
        *index = nbytes;
    if ( nbytes != *index ) {
        /* bad field (most likely NULL), continue anyway */
        *pXP = (unsigned char *) NULL;
        return 1;
    }

    if (!(*pXP = (unsigned char *) UTL_MEM_ALLOC( nbytes )))
        return 0;

    for (k = 0; k < nbytes; k++) {
        pair[0] = hex[2*k];
        pair[1] = hex[2*k + 1];
        pair[2] = '\0';         /* the original scanned an unterminated buffer */
        /* %x needs an unsigned int; a pair that is not hex is stored as 0
           instead of whatever the previous iteration left behind. */
        if ( sscanf( pair, "%2x", &value ) != 1 )
            value = 0;
        (*pXP)[k] = (unsigned char) value;
    }
    return 1;
}

static int RetrievdnputO { 

/* reads Oie search pattern fields (generated by SYBYL script) */ 
10 int index, R, F; 
char *Une; 
double atofO; 

if (llnputSource) InputSourceFile = stdin; 
else if (! (InputSourceFile = fopen( InputSource, "r** ) )) { 
15 fprintf( stdout, "Could not open -input file %s\n", InputSource ); 

r^m 0; 

} 

if (!(Yin = (unsigned char •••) UTL_MEM_ALLOC( sizeof( unsigned char • nFType 
))) 

20 return 0; 

for (F = 0; F < nFType; F++) { 
if (!(Yin[ F ] = (unsigned char **) UTL_MEM_ALLOC( sizeof( unsigned char *) ♦ nR 

))) 

r^m 0; 

25 memset( Yin[F], 0, sizeof( unsigned char *) • nR ); 
} 

if (!NoCore) { 

/* field types are paired closest! */ 

for (index = 0; index < nR; index++) for (F = 0; F < nFType; F++) { 
30 /* a Field is on a single line, no parsing needed */ 

if (-1 = = UTL_SCAN_GETS( InputSoureeFiIe. "W", &line)) 
return 0; 

if (!ReadAField( line, Xsize( F ] + index, Yin( F ] + index )) return 0; 
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} 

for (ind^ = 0; index < 6; index ++) { 

if (-1 == UTL_SCAN_GErS( InputSouiceFUe, •\\", "r. &Une)) 

return 0; 

5 CXcoordst index ) = atof( line ); 

} 

if (!(Xin = (unsigned char *•*) UTL_MEM-ALLOC( sizeof( unsigned char * nFType 
))) 

return 0; 

10 for (F = 0;F < nFType; F++){ 

if (!(Xin[Fl = (unsigned char **) UTL_.MEM_ALLOC( sizeof{ unwgned char *) * nR 

))) 

return 0; 

niemset( Xin[F]. 0. si2eof( unsigned char *) * nR ); 

15 } 

for Cindex = 0; index < nR; index ++) for (F = 0; F < nFType; F++ ) { 

/* a Field is on a single line, no parsing needed */ 

if (-1 == UTL_SCAN_GETS( InputSourceFile, &Une)) 

return 0; 

20 if^IReadAFiddC line, Xsize[ F ] + index, Xin[ F ] + index )) return 0; 

} 

} 

fclose( InputSourceFile ); 
return 1; 
25 } 

static int InitCoreQ { 

/* readies core fde and its input arrays */ 
int R, i, F; 
char *foo; 

30 

if (! (CoreFile_File = fopen(CorefiIe,"r"))) { 

fprintf( stderr. "%s Core file not foundAn", Corefile ); 
return 0; 
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} 

i=0; 

while ( i < StartCore ) 
{ 

5 if ( -1 = = UTL_SCAN_GETS( CdrcFHe_File, 'W, &foo)) return 0; 
if (AllCores) break; 
i++; 

} 

Con^ow = UTL_STR_SAVE( foo ); 
10 /* initialize core data structures */ 

if (!(Y = (unsigned char •**) UTL_MEM_ALLOC( sizeof( unsigned char **) • 
nFType)) ) 

return 0; 

for (F = 0; F < nFType; F++) { 
15 if (!(Y(FJ = (unsigned char **) UTL_MEM_ALLOC( si2eof( unsigned char •) • nR)) ) 

return 0; 
for(R = 0; R < nR; R++) 

if (!( •( (Y(F]) + R ) = (unsigned char •) UTL_MEM_ALLOC( sizeof( unsigned 
char ) 

20 * (*Xslze[F ]) + R ) )) return 0; 

) 

return 1; 

} 

int CountUnesQ 
25 { 

int i; 

char *foo; 

/* note that CountLines returns one less than the actual number */ 
i=0; 

30 while ( -1 != UTL_SCAN_GETS( InputSourceFUe, "W", &foo)) i-f + ; 
rewind(InputSourceFiIe); 
return i; 



wo 97/27559 PCT/US97/01491 

616 

} 

Static int initXarrays 0 

{ 

intF, i; 

5 if (!(Xfile = (char *♦) UTL_MEM_ALLOC( sireof( char* ) * nR ))) return 0; 

if (!(Xname = (char **) UTL_MEM_ALLOC( si2eof( char* ) * nR ))) letum 0; 

if (!(nX = (int*) UTL_MEM_ALLOC( si2eof( int ) * nR ))) return 0; 

if (!(Xct = (int*) UTL_MEM_ALLOC( sixeof( int ) • nR ))) xetam 0; 

for 0 = 0; i < nR; i++) { XfileDl = 0; Xnamepl = 0; nXfi] «= 0; Xctfi] = 0; } 
10 if (!(X = (unsigned char ****) in'L_MEM_ALLOC( sizeof( unsigned char ***) • 

nFType)) ) 

return 0; 

for (F = 0; F < nFType; F++) { 

if (!(X[F] = (unsigned char •••) UTL_MEM_ALLOC( sizeof( unsigned char •*) 

15 * nR)) ) 

return 0; 

memset( X[F], 0, sizeof( unsigned char **) * nR ); 

} 

if ('(Xsize = (int ••) UTL_MEM_ALLOC( azeof( int * ) * nFType ))) return 0; 
20 for (F = 0; F < nFType; F++) { 

if (!(Xsize[Fl = (int *) UTL_MEM_ALLOC( sizeof( int ) * nR ))) return 0; 
for Ci 0; i < nR; i++) *(Xsize[F] + i) = 0; 

} 

return 1; 

25 } 

static int initXfiles( i, SideChainsAreSame ) 

/* reads X file data (reactant descriptors from 2nd comment line of X file ) */ 
int i, ^SideChainsAreSame; 

{ 

30 char *foo, *pch; 

if ( -1 == UTL_SCAN_GETS( MasterFile_File, "W", &foo)) return 0; 
if (XfileDl) { 
/* if this X file is same as last, nothing to do */ 
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if (!sticinp( Xfile[ i ], foo ) ) letum 1; 
*SideChainsAreSaine = FALSE; 
UTL_MEM_FREE( Xfilep) ); 

} 

5 Xfile( i ] = UTL_STR_SAVE(foo); 

if (! (InputSouiceFile = fopen(Xfile{i]."r"))) { 

fyriiitf( stdout, "Could not open variation file %s\n", XfikQ] ); 
return 0; 

} 

10 /* leading COMMENT lines to get USER_NAME value for matching */ 

if ( -1 UTL_SCAN_GETS( InputSourceFile, 'W", "", &foo)) return 0; 

if (-1 == UTL_SCAN_GETS( InputSourceFile, "W", "", &foo)) return 0; 

if (XnamepJ) UTL_MEM_FREE( Xname(i] ); 

Xname[i] =0; 
15 pch = strstr( foo, "USER_NAME=" ); 

pch += strlen( "USER_NAME=" ); 

if (!(Xnanieri] = UTL_STR_SAVE( pch ) )) return 0; 

rclose( InputSourceFile ); 
. return 1; 

20 } 

int StartFromBitsetO 
{ 

void *CS_PRDCT_BITSET_OPEN0; 

if ( !( bitset = CS_PRDCT_BrrSET_OPEN( BitsetFile, MasterRecord))) return 0; 

25 

if ( !RetrieveMasterFiIeFromBitset(bitset, 

&MasterFile, 

&MasterRecord, /*in master file*/ 
0. 

30 0. 

0. 
0. 
0, 
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0. 
0, 
0, 
0. 
0, 
0. 
0. 
0. 
0. 
0, 
0. 

0 ) ) return 0; 



return 1; 



} 

/* 1/7/97 DEP: allow reading of bitsets. Since the masterfile must be 

read in any case, the bitset only generates "Dead^Products" */ 
int InitMasterFileO 

I* Read the master file record which is requested; 

failure if it does not match the input line info ^/ 

{ 

int i, d, size, rxMatch, inc, ns, *Sym; 
char *foo; 
int *fooi; 

if (BitsetFUe && ! StartFromBitsetQ) return 0; 
if (! (MastcrFilc_File = fopen{MasterFile,*r"))) { 

fprintf( stdout, (master file) not foundAn", MasterFile ); 

return 0; 

} 

rxMatch = irx = 0; 
while ( irxMatch) { 

if ( -1 == UTL_SCAN__GETS( MasterFile_File. "W", &foo)) return 0; 
if ( strstr(foo,*'Reaction class ")) { 
irx++; 



t 
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if (bitset && iix > Mast^Record) return 0; /* the right record did not match "^Z 
I* preliminary match if (1) Reaction Needed matches and (2) 

NOjX)re must be present if NoCore is TRUE (or vice versa) */ 
rxMatch = ( irx > = MasterRecord && strstr( foo, ReactionNeeded ) 
5 && ((INoCore && !strstr( foo, "NO^core" ) ) 

1 1 ( NoCore && strstr( foo, "NO_core" ) ) ) ); 

} 

/* if preliminary match, check rest of .mf record — first M reactants */ 
if (rxMatch) { 

10 /* skip name, record / compare number of reagents V 

if ( -1 == UTL_SCAN_GETS( MasterFile_File, "W", &foo)) return 0; 
if ( -1 UTL_SCAN_GErS( MasterFile_File, "W", &foo)) return 0; 
if ( ! lITL_STR_ATOI(foo, &d) ) return 0; 

if (!nR) { 

IS if (SideChaiiiOnly && d != 1) { 

fiprintf( stdout, "Side Chain only but .mf file references more than 

one side chainAn" ); 

return 0; 

} 

20 nR = d; 

if (!initXarraysO) return 0; 

} 

rxMatch = nR = = d; 

} 

25 if (rxMatch) { 

/* skip fgpt stuff, record core and side chain file stuff **/ 

if ( -1 == UTL_SCAN_GETS( MasterFile_FiIe, "W", &foo)) return 0; 

if ( -1 == UTL_SCAN_GETS( MasterFile_File, "W", &foo)) return 0; 

if ( -1 == UTL_SCAN_GETS( MasterFile_File, "W", &foo)) return 0; 
30 if (Corefile) UTL_MEM_FREE( Corefile ); 

Cbrefile = UTL_STR_SAVE(foo); 

if ( -1 == UTL_SCAN_GETS( MasterFile_File, "W", &foo)) return 0; 
if ( ! UTL_STR_ATOI(foo, ftStartCore ) ) return 0; 
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if ( -1 == UTL_SCAN_GETS( MasterFile_File, "W", &foo)) return 0; 
if ( -1 = = UTL_SCAN_GETS( Mast«File_FiIe, "W", &foo)) r^um 0; 
for (i = 0; i < nR; i4-+) if (!initXfiles( i, &SideChainsAieSame ) ) return 0; 

} 

S } /* read .mf file until we have a matching reaction *l 
return 1; 

} 

Static int ReadXsQ { 

1"^ reads all topmer fields from all current Xn files 
10 int R, F, i, n, ns, realloc, Fd; 
char *CTOPS, *line, *fptr; 
double *dp, **sdptr; 
unsigned char **uc; 
combi = 1:0; 

IS skip the following lengthy stuff if side chains are all the same */ 
if (SideChainsAreSame && X[0]) return 1 ; 
for (R = 0; R < nR; R++) { 
if {! (InputSourceFile = fopen(Xfile[R]/r"))) return 0; 

n = CountLinesQ; 
20 realloc = n != nX[R]; 

combi (double) n; 
if (realloc && nX[ R ] ) 

for (F = 0; F < nFType; F++) { 
for 0 = 0; i < nX[R]; i + +) UTL^MEM_FREE( *(X[F1 + R) + i ); 
25 UTL_MEM_FREE( X[F1 + R ); 

} 

nXt R ] = n; 

if (realloc) for (F = 0; F < nFType; F++) 
30 if (!(*(X[F] + R) = (unsigned char **) 

UTL_MEM_ALLCX:( sizeof( unsigned char *) ♦ nX[R]) )) return 0; 
/* starts reading at line 2! */ 
for (i = 0; i < nX[R]; i-f+) { 
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if (-1 == UTL_SCAN_GETS( InputSourceFile, "W", Aline)) 
goto error; 
/• generate info for left-to-iight read */ 

for (F = 0; F < nFType; F++) FOrderf F J = strstr( line, FTypes[ F ] ); 
S do{ 

for (Fd = -1, F = 0, fytr = 0; F < nFType; F++) 

if (FOnter[F) && (!fptr 1 1 FOider[F] < ^tr))-{liptr = FOrder(iq; Fd = 

F;} 

if (fptr) { 

10 fptr + = strlen( FTypes[ Fd ] ) + 1; /*skipping "CTOPS=" */ 

UTL_SCAN_TOKENIZE(fptr. ' ; • , 'W') ; 
UTL_SCAN_TOKENIZE(fptr. • > • , 'W) ; 

if (!ReadAField( fptr, Xsii^L Fd ] + R. *(X[Fd] + R) + i )) goto error; 
FOrderC Fd ] = 0; 

15 } 

} while (fptr); 

} 

fclose( InputSourceFile ); 
/* set up X - Y distance vectors */ 
20 if (realloc) for (F = 0; F < nFType; F++) for (ns = 0; ns < nSym; ns+ +) { 
sdptr = X2Y(nsl; 

if (sdptrfR]) UTL_MEM_FREE( sdptr[R] ); 

if (!( sdptrl R ] = (double *) UTL_MEM_ALLOC( sizeof( double ) • nX[R] ) )) 

* 

return 0; 

25 for (i = 0. dp = sdptr[R]; i < nX(R]; i++) *dp++ = -1.0; 

} 

] 

return 1; 
error: 

30 fprintf( stdout, "topsim failed reading line %d of %sAnLast line read was %sAn", 
i. Xfile[R], line ); 
return 0; 
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char **ParseQuotedString( SDetails, nDetail, Wdghts ) 
char *SDetails; 
int *nDetail; 
double **Weights; 
5 { 

char *pch, **ppch. *wch, **Details; 
inti; 

double *wt; 

/* first trim string to remove leading/trailing spaces and quotes */ 
10 while (*SDetails == 1 1 *SDetails == SI>etails+ + ; 
pch - SDetails + strlen( SDetails ) - 1; 
while (*pch == 1 1 *pch == ' ') *pch- = •\0*; 
/* each space is token delimiter 

for (i = 0, pch = SDetaib; ♦pch; pch++) 
15 if (*pch = = • i-f-+; 

♦nDetail = i+1; 

if (!(DetaUs = (char **) UTL_MEM^ALLOC( sizeof( char * ) * (*nDetail) ) )) 

return 0; 
if (Weights) { 

20 if_(! (•Wrights = (double *) UTL_MEM_AliOC( sizeof( double ) • (*nDetail) ) )) 

return 0; 

wt = *Weights; 

} . 

pch = SDetails; 
25 if(*i)ch == •"')pch++; 

for (i = 0, ppch = Details; i < •nDetJul; i+ + , ppch++) { 
UTL_SCAN_TOKENIZE(pch,' '.'W); 
*ppch = UTL_STR_SAVE( pch ); 
if (Wdghts) { 
30 /• note, the copy is now being modified */ 

if {(wch = strstrC *ppch. "="))){ 

if (!isweight( wch + 1 )) return FALSE; 
*wt = atof( wch + 1 ); 
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*wdi = •XO'; 

} 

else ♦wt = 1 .0; 
wt++; 

5 } 

pch += strlen( pch ) + 1; 

} 

ietum( Details ); 

} 

/*
 * isweight - check that a weight string contains only digits and '.'.
 *
 *   s   NUL-terminated candidate weight text (e.g. the part after '=' in
 *       a field specification).
 *
 * Returns 1 (TRUE) when every character is a decimal digit or a period,
 * 0 (FALSE) otherwise.  On rejection a diagnostic is written to stdout.
 * Signs and exponents are therefore rejected, which is what restricts the
 * accepted values to non-negative plain decimals.
 *
 * NOTE(review): an empty string is accepted (no offending character is
 * found); callers then obtain 0.0 from atof().  Left as in the listing.
 */
int isweight( s )
char *s;
{
    char *c;

    if (!s) {
        fprintf( stdout, "Bad weight value: (null). Aborting\n" );
        return 0;
    }

    for (c = s; *c; c++) {
        /* cast: passing a negative plain char to isdigit() is undefined */
        if (!isdigit( (unsigned char) *c ) && *c != '.') {
            fprintf( stdout, "Bad weight value: %s. Aborting\n", s );
            return 0;
        }
    }

    return 1;
}

int ParseRxnO 

/* parses complex input descriptions 
{ 

char **ParseQuotedStringO, **scratch; 
25 int nRW. i, nX; 
double wtsum; 

/* parse field type information or set up standard (steric) type only */ 
if (FieldTypes) { 

if (!(FTypes = ParseQuotedString( FieldTypes, &nFType, &FWeights ) )) return 0; 
30 /* scale to average weight of unity */ 

for 0 = 0, wtsum = 0.0; i < nFType; i++) wtsum += FWeights[i]; 
wtsum /= (double) nFType; 

for (i - 0; i < nFType; i++) FWeights[ i ] /= wtsum; 
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} 

else { 
nFType = 1; 

if (!( FTypes = (char *•) UTL_MEM_ALLOC( sizeof( char • ) ) )) return 0; 
5 if (!( *FTypes = UTL_STR_SAVE( "CTOPS" ) )) return 0; 

if (!( FWdghts = (double *) UTL_MEM_AUjOC( sizeof( double ) ) )) return 0; 
*FWeights-= 1.0; 

} 

if (!(FOfder = (char **) IJI1,_MEM_AIIXX:( sizeof(char *) * nFType ) )) return 0; 
10 /* parse any reaction type information present */ 
nR = 0; 

if (SideChainOnly) { 
NoCore = TRUE; 
return 1; 

15 } 

if (IReactionNeeded) return 0; 
if (ScratchDetails) { 

if (!(ReactionDetails = ParseQuotedString( ScratchDetails, &nDetaiI, NIL ) )) return 0; 
nR = nDetail; 
20 if (linitXarraysO) letum 0; 

} 

if (!(FROrder = (char **) UTL_MEM_ALLOC( si2eof(char •) • nFType * nR ) )) return 
0; 

/* parse any us^-provided variation weighting */ 
25 CoreWeight = 1.0; 

if (!( RWeights = (double *) UTL_MEM_ALLOC( sizeof( double ) * nR ) )) return 0; 
if (XWeights) { 

if ('(scratch = ParseQuotedString( XWeights, &nRW, NIL ) )) return 0; 
/* scratch will juA be unfreed memory */ 
30 nX = nR + (NoCorc ? 0 : 1); 

if (nRW != nX ) { 

^rintf( stdout, "Mismatch between count of xwdghts (%d) and needed 
(»d).\n", nRW, nX ); 
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return 0; 

} 

for (i = 0, wtsum = 0,0; i < nR; i++) if (!isweight( scratch[ i ] )) return 
FALSE; 
S else { 

RWeights[ i ] = atof( scratch[ i ] ); 
wtsum += RWeight$[ i J; 

} 

if (INoCore) if (!iswdght( scratch[ nR ])) return FALSE; 
10 else { 

CoreWeight = atof( scratch[ nR ] ); 
wtsum += CoreWeight; 

} 

wtsum /= <double) nX; 
15 for 0 = 0; i < nR; i++ ) RWdghtst i ] /= wtsum; 

if (INoCore) CoreWeight /= wtsum; 

} 

else for (\ = 0; i < nR; i++) RWcights( i ] = 1.0; 
return 1; 

20 } 

int ReadEverythingO 
{ 

if (!MasterFile && !BitsetFile) return 0; 
if (!ParseRxnO) return 0; 
25 setbits_nbitsJnitO; 

if (IlnitMasterFileQ ) return 0; 
if (IlnitCoreO ) return 0; 

if (IWhatsTheDifferwiceO) return 0; 
if (IRetrievelnputO ) return 0; 
30 return I; 

static int InitSym( nsym ) 
int nsym; 



wo 97/27559 PCT/US97A)149I 

626 

{ 

I* sets up symmetries to consider as described for core 
ONLY 2 leactants considered for now! 
assumes that CoieNow is pointing to the appropriate structure *l 
S int i, F, maxsym; 
double **x2y; 

/* get symmetry firom currentcore molecule if not supplied by caller-*/ 
nSym = nsym; 
if ( InSym ) { 

10 if ((!strstr( CorcNow, -SYM=" )) 1 1 (strstr(CoreNow, "SYM-O-)) ) nSym = i; 
if {strstr<CoreNow, "SYM=r)) nSym = 2; 
/* add more categories here */ 

} 

for 0 = 0; i<nSyin; i++) CoreSyins( i ] = 1; 
IS /* allocate distance arrays to max possible for nR */ 
if (!X2Y) { 

for (maxsym = 1, i = 0; i < nR; i++) maxsym •= (i+1); 
if (!(X2Y = (double *•*) UTL_MEM_ALLOC( sizeof( double •*) * nFType ) )) 
return 0; 

20 for (i = 0; i < maxsym; i=M^) { 

if (!(X2Y[il = (double**) UTL_MEM_ALLOC( sizeof( double *) * nR) )) return 0; 
memset( X2YriJ, 0. sizeof( double *) • nR ); 

} 

} 

25 return nSym; 
} 

int ReadCoreTopomers( CoreOK ) 
int *CoreOK; 

{ 

30 /* returns 1 unless fatal error. Sets CoreOK to TRUE if this mf entry is OK 
Also sets up symmetry considerations (which are core structure dependent), 
assumes that CoreNow is pointing to the appropriate structure */ 
int foo, i, R, F, Fd, Rd, rf» skipcore, ns, *Sym; 
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dm Iabel[lS], *iixTop, •cstart, *fptri 

char *cnainesn = {"NX=","NY=","NZ=","CX=","CY=","CZ="}; 
double coo; 
double atofO; 
5 skipcore = NoCore; 

/* always consider both matches iff no core */ 
if(skipcore) InitSyni( nR ); 

else skipcore - ! InilSyni(0); 
/* dieck for any symmetry-allowed ncn by ixn match of all rcactant name "details" */ 
10 for (ns = 0; ns < nSym; ns++) if (CoreSyms[ns]) { 
Sym = *(SymUst + ns); 
♦CoreOK = TRUE; 
if (tSideChainOnly) 
for 0 = 0; i < nR && *C0reOK; i++) 
15 if (!strstr( ReactionDetalls( Sym[ i ] ], Xnamet i ] )) 

*CoreOK = FALSE; 
if (*CoreOK) break; 

} 

if (skipcore i | CorelsSame ] | '(•CoteOK )) return 1; 
20 nxTop = CoreNow; 

/* read left-to-right, so record all starting points; 
assume that coords are bunched and appear only once 

for (F = 0; F < nFType; F++) for (R = 0; R < nR; R+-t-) { 
25 sprintf( label, "%s%d", FTypes[ F ], R + 1 ); 

if (!( FROrderf F • nR -I- R J = strstr( nxTop, label ) ) ) { 
/* some requested datum missing; then this core entry has no topomer data; use it •/ 
•CoreOK = 0; 
return 1; 

30 } 
} 

cstart = strstr( nxTop, cnames[ 0 ] ); 
do { 
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/* find next datum in lefl-to-right ider *l 

for (F = 0, fptr = 0; F < nPType; F++) for (R = 0; R < nR; R++) { 
rf » F * nR + R; 

if (FROrd^rQ && (llptr 1 1 FROnlei[rq < Ijptr)) {Ijptr = FROideifrf]; Fd = F; 
5 Rd = R;} 
} 

if (cstart && (!^tr | j cstart < iptr)) {fptr = cstart; Fd = -1; } 
if (iiptr) { 

f* unpack next frieoe of data to fumpex location *f 
10 if (Fd > = 0) { 

/*" then datum is a field */ 

fptr + = strlen( FTypesI Fd ] ) + 2; /•skipping "CTOPn=" */ 
UTL_SCAN_TOKENIZE(fiptr,';','\\'); 
UTL_SCAN_TOKENIZE(fiptr,* > * ,*\\'); 
15 if (!ReadAFid[d( fptr, Xsi2e( Fd ] + Rd, Y{FdJ + Rd )) return 0; 

FROidcrt Fd * nR + Rd ) = 0; 

} 

else { 

for(t = 0; i < 6; i+-h) { 
20 /* the next data are coordinates 

Z'*' read coords, save as distances squared 

cstart = strstr( cstart, cnamesp]); 
if (fcstart) { 
/* *en this core entry has no topomer data */ 
25 *CoreOK = 0; 

return 1; 

} 

cstart -(-= strlen(cnames[i]); 
UTL_SCAN_TOKENIZE(cstart.';','\V); 
30 UTL_SCAN_TOKENIZE(cstatt.' > ' , 'W'); 

coo = CXcoordsT i ] - atof(cstart); 
CXdiffsq[ i ] = coo • coo * DWeight; 
cstart += strlen( cstart ) + 1; 
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} 

cstart = 0; 

} 

} 

} while (fjptr); 
return 1; 

} 

int CoreMatches( CoieOK ) 
int ♦CoieOK; 

{ 

/* returns 1 unless fatal error. Sets CoreOK to FALSE if no compound ha^ng 
this core can possibly match */ 
int F, R, i, ns, *Xct, ct; 
double sqrtO, totd, xount, cdiff; 
unsigned char *ptr, *qtr; 
if (NoCore 1 1 CorelsSame) { 
*CoreOK = TRUE; 
return 1; 

} 

/♦ can check for coordinate discrepancy fastf */ 
for (i = 0, cdiff = 0.0; i < 6; i++) cdiff -h= CXdiffsqp]; 
if (cdiff > Distance) { 

*CoreOK = FALSE; 

return 1; 

} 

for (F = 0. totd ,= cdiff; F < nFType; F++) for (R = 0; R < nR; R++) { 
if (totd > Distance) break; 
ptr = (unsigned char *) *(Y[ F ] + R); 
qtr = (unsigned char *) *(Yin[F] + R); 
if Optr 1 1 !qtr) xount = 999999.0; 

else for(xount=0.0, i=0; i < *(XsizeIFl -h R); i++, ptr++, qtr++) 
xount +=Dist[ *ptr&0x0F ][ »qtr&0x0F J 

+ Dist[ (*ptr & OxFO) > > 4]( (*qtr & OxFO) > > 4] ; 
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totd += xount * FWeights[ F J / (double) nR; 

} 

CoreDistance = totd * CoreWaghi; 
♦CoreOK = totd < = Distance; 
5 return 1; 

} 

int FindXMatches 0 { 
inl R, F, i, ns, ct, *Sym, size, what ; 
double totd, d, **sdptr, *dptr, xount; 
10 unsigned char *ptr, *qtr; 

/* reinitialize indices for permuting over all products - 

code is general for any number of variable positions */ 
for (i « 0; i < nR; i++) Xct[i] =0; 

AddressSize(nR, nX, &size); 
15 size = (size + 31 )/32 * 4; 

if (bitset) /* assumes actualisizes matches current sizes!*/ 

{ 

if (!(Dead_Products = fmt *) UTL_MEM_ALLOC(size))) return 0; 
CS_PRDCrr_BITSETjrO_RAW( bitset. Dead_Products, 0); 
20 not_here(Dead_Products,size ); 

} 

while ( TRUE ) { 
/* exit elsewhere when all products are enummted */ 
IndexesToAddress( nR, nX, &what, Xct); 
25 if (Dead_^Products && 

Testl>ead(0, what) ) goto tupledone; /* not doing Oiis one! ♦/ 

for (ns = 0; ns < nSym; ns++) if (CoreSyms[nsl) { 
/* process all symmetries of current side chain combo */ 
Sym = *(SymList + ns); 
30 sdptr = •(X2Y + ns); 

for (R 0, totd = CoreDistance; R < nR; R+-f ) { 
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if (totd > Distance) break; 
/* compute next distance if not already done - DEP knows how this works! */ 
dptr = (•(sdptr + R )+Xct[R]); 
if ((*dptr) < 0.0) for (F = 0; F < nPT^pe; F++ ) { 
5 ptr = (unsigned char *) *( *(XfF] + R) + Xct[ R ]); 

qtr = (unsigned char *) *OQn{ F ] + Symt R ]) ; 
if (!ptr j I !qtr) {*dptr = 999999.0; break;} 
dse { 

for(xount=0.0, i=0; i < *(Xaze(F] + R); i++, ptr++, qtr++) 
10 xount + = Dist[ *ptr & OxOF ][ *qtr & OxOF ] 

+ Distf (•ptr & OxFO) > > 4][ (•qtr & OxFO) > > 4] ; 
•dptr + = xount • FWeights[ F ]; 

} 

} 

15 totd + = •dptr • RWdights[ R ]; 

} 

/• if hit, write it out */ 

if (totd < = Distance) { 
if (NotBitOutput 1 1 nR ! = 2) { 
20 /* ASCn index form of output - also REQUIRED if more than 2 varying elements •/ 

if (lOutputFile && lOpenOuQjutFileO ) return 0; 
for (R = 0; R < nR; R++) fprintf( OutputFile, "%6d ", XctfR] + 1 ); 
fiprintf( OutputFile. "%6d%8.2f%8.2f%8.2An", StartCore, 

sqrt(totd), sqrt(CoreDistance), sqrt(totd - CoreDistance) ); 

25 } 

else { 

if (!Good_Products ) { 

if (!(Good_Products = (int *) UTL_MEM_ALLOC( size ) )) return 

0; 

30 memset( Good Products, 0, size ); 

I 

FlagPn)duct(Good_Products, 0, 0, what ); 

} 
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nout++; 

if (NoMorehitsPlease SlSl nout > = NoMorehitsPlease) goto done; 
/* output only one accq;>table symmetry per product */ 
goto tupledone; 
5 } 
} 

/* goierate next index tuple, AKA candidate product */ 
tupledone: 

ct = nR - 1; 
10 while { TRUE ) { 

Xcttct 1 ++; 

if (Xct[ ct 1 < nX[ ct ]) break; 
/* finished when first index exceeds limit — the other exit */ 
if (ct == 0) goto done; 
15 Xct[ ct ] = 0; 

ct-; 

} 

} 

done: 

20 /* output any products from this dataset V 
if (NotBitOutput 1 1 nR !^ 2) { 
if (OutputFile) fclose(OutputFile); 
OutputFile = 0; 

} 

25 else if (Good_Products) { 
WriteStdFileO; 

UTL_MEM_FREE( Good Products ); 
Good_Products = (int*) 0; 

} 

30 return 1; 
} 

int MakeOutputPileNameO { 

/'^ a run may produce multiple files, and the user probably can*t tell, 
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so append a sequence J to subsequent base names 
if (InOutFiles) { 

sprintf( OutputFileName, -%s-, OutputFileBase ); 
/* get base nanoe ready for next call */ 
5 strtok( Ou^utFileBase, ); 

} 

else sprintf( OutputFileName, *%s_%d.%s", OutputFileBase, 

nOutFiles, OutputFileBase + strla)(OutputFileBase) + 1 ); 
nOutFiIes-h+; 

10 } 

int WriteStdFileO { 
/* writes out the bit set of products */ 
int sizes[2]; 
int allocSizes[2] ; 
15 int numInSites[2] ; 
void ^compressed ; 
int total ; 

sizes[0] = nX[0] ; 
sizes[l] = nX[l] ; 
20 numInSites[0] = numInSites[ll = -1 ; 

allocSizes[01 = allocSizesII] -1 ; 
compressed = NIL; 
total = 0; 

MakeOutputFileNamcO ; 
25 WriteOutCheckPointFile( OutputFileName. 

MasterFile, 

MasterRecord, 

comline, 

Good_Products, 
30 0, 

2, 

sizes, 
allocSizes, 



I 
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nout, 

numlnSites, 
total, 

compressed); 

5 } 

int ReadNextCore( SideChainsAreSame, CorelsSame ) 
int ^SideChainsAreSame; 
int ^CorelsSame; 

{ 

10 continues reading through master file for more matching Reaction Classes. 

If the side chain files have the same name, can skip rebuild of X diffs 
diar *foo; 

int i, d. rxMatch = 0, val; 
if (AUCores) { 

15 if ( .1 ^= UTL_SCAN_GETS( CoreFile_File, "W". &foo)) fclose( 

CoreFile^Rle ); 
else { 

/* get next core ready and quit */ 

CoreNow = UTL_STR_SAVE(foo); 
20 *SideChainsAreSame = TRUE; 

StartCore+ + ; 
return 1; 

> 

} 

25 while ( !rxMatch ) { 

if ( -1 == UTL_,SCAN_GETS( MasterFile_File. "W". -#^ &foo)) return 0; 
/* preliminary match if (1) Reaction Needed matches and (2) 

NO_core must be present if NoCore is TRUE (or vice versa) */ 
rxMatch = ( strstr(foo/Reaction class ") && strstr(foo, ReactionNeeded) 
30 && ((! NoCore && !strstr( foo, -NO_core" ) ) 

j I ( NoCore && strstr( foo, ''NO^core" ) ) ) ); 
if (feof(MasterFile_File)) return 0; 
/* skip name, record number of reagents */ 
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if (rxMatch) { 

if ( -1 == UTL_SCAN_GErS( MasterFile_File, "W", &foo)) return 0; 
if ( -1 == UTL_SCAN_GETS( MasterFile_File, "W", &foo)) return 0; 
if( ! UTL_STR_ATOI(foo, Aval) ) return 0; 
if (val ! = nR) ncMatch = 0; 

} 

if (rxMatch) { 

/* skip fgpt stuff, record core and dde chain file stuff */ 

if ( -1 == UTL_SCAN_GETS( MasterFUe_File, "W", &foo)) return 0; 
if ( -1 == UTL_SCAN_GErS( MasterPile_File, "W", &foo)) return 0; 
if ( -1 == UTL_SCAN_GETS( MasterFile_File. "W, &foo)) return 0; 
*CoreIsSame = TRUE; 
if (strcmp( too, Corefile )) { 

^CorelsSame = FALSE; 

UTL_MEM_FREE( Corefile ); 

Corefile = UTL_STR_SAVE(foo); 

} 

if ( -1 == UTL_SCAN_GErS( MasterFile_File. "W", &foo)) return 0; 
if ( ! UTL_STR_ATOI(foo. Aval ) ) return 0; 

if (val ! = StartCorc ) *CoreIsSame = FALSE; 
StartCore = val; 
if (! *CorelsSame ) { 

if (CoreFile_FiIe) fclose(CoreFiIe_File); 

if (! (CoreFile_File = fopen(Corcfile,"r"))) return 0; 

i=0; 

while ( i < StartCore ) { 

if ( -1 = = UTL_SCAN_GETS( InputSourceFile. "W". &foo)) return 0; 

if (AllCOies) break; 

i++; 

} 

CoreNow = UTL_STR_SAVE( foo ); 
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if ( -1 == UTL_SCAN_GErS( MasterFile_File, "W", &foo)) return 0; 
if ( -1 == UTL_SCAN_GETS( MasterFile_File, "W", &foo)) return 0; 
*SideChainsAreSame TRUE; 

for (i » 0; i < nR; i4--f ) if (!initXfiles( i, SideChainsAieSame ) ) return 0; 

5 } 
} 

return 1; 

} 

/* this belongs in the utl module, actually */
/*
 * MakeComLine - rebuild the command line as one space-separated string.
 *
 *   line   destination buffer
 *   len    capacity of line in bytes, including the terminating NUL
 *   argc   number of entries in argv
 *   argv   argument vector; each argument is copied followed by one space
 *
 * Only whole arguments are copied: copying stops at the first argument
 * that would not fit, so line is always NUL-terminated and never written
 * past len bytes.  (The listing tested the length *before* appending, so
 * a long argument could overrun the buffer, and argv[0] was copied with
 * no bound at all.)
 *
 * Returns 1 when every argument was copied, 0 when the output had to be
 * cut short or the buffer is unusable.
 */
int MakeComLine( char *line, int len, int argc, char **argv)
{
    int i;
    size_t used = 0, need;

    if (!line || len <= 0) return 0;
    line[0] = '\0';
    if (!argv) return argc <= 0;

    for (i = 0; i < argc && argv[i]; i++) {
        need = strlen( argv[i] ) + 1;            /* argument + one space */
        if (used + need + 1 > (size_t) len)      /* + terminating NUL    */
            return 0;
        memcpy( line + used, argv[i], need - 1 );
        used += need;
        line[used - 1] = ' ';
        line[used] = '\0';
    }

    return 1;
}

* 

/*
 * CheckPointProgram - placeholder; only reports on stderr that it is a stub.
 *
 * NOTE(review): the listing fell off the end of this int function without
 * returning a value.  1 is returned here to follow the file's
 * "1 = no fatal error" convention - confirm what callers expect.
 */
int CheckPointProgram(void) {
    fprintf(stderr, "CheckPointProgram() is a lonely stub in topsim.c!\n");
    return 1;
}

25 int niain( argc» argv ) 
int argc; 
char **argv; 

{ 

int processing; 
30 if( !ParseArguments( argc, argv ) ) 

goto SyntaxError; 
MakeComLine( comline, 2048, argc, argv ); 
if (IReadEverythtngO) goto FailureExit; 
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processing = 1; 
while (processing) { 

if (!ReadCoFeTopomers( ACoreOK )) goto FailureExit; 

if (CoreOK && !CoreMatches( &CoreOK )) goto FailureExit; 
5 if (CoreOK && IReadXsQ) goto FailureExit; 

searched + = combi; 

if (CoreCMCAA IFindXMatchesO) goto FailureExit; 
totnout + = nout; 
nout = 0; 

10 processing = ReadNextCore( ASideChainsAreSame, fcCorelsSame ) && 

(INoMorehitsPlease | } nout < NoMorehitsPIease); 

} 

fprintf(stdout, "Normal Exit: %d of %{ are neighbors\n", totnout, searched ); 
UserAborted ? exit(EnorExit) : exiKGoodExit); 
15 SyntaxError: 
exitd); 
FailureExit: 

exit(ErrorExit); 

} 

/*
 * AddressToIndexes - split a linear product address into per-site indexes.
 *
 *   numVariations  number of dimensions Y_01, Y_02 etc (normally 2)
 *   allPtr         sizes of each dimension (nY_01, nY_02 etc)
 *   address        the bit number (0 to N-1)
 *   chPtr          out: receives the 0-based offsets of Y_01, Y_02 etc.
 *
 * The last dimension varies fastest, i.e. for two dimensions
 *   address = chPtr[0] * allPtr[1] + chPtr[1].
 *
 * Returns 1 on success, 0 when a dimension size is not positive (the
 * division below would otherwise be undefined).
 */
int AddressToIndexes(int numVariations, int *allPtr, int address, int *chPtr )
{
    int i;

    /* indexed walk from the last dimension down; no pointer ever steps
       before the start of either array */
    for ( i = numVariations - 1 ; i >= 0 ; i-- )
    {
        if ( allPtr[i] <= 0 )
            return 0;
        chPtr[i] = address % allPtr[i];
        address = address / allPtr[i];
    }

    return 1;
}

/*
 * IndexesToAddress - combine per-site indexes into a linear product address.
 *
 *   numVariations  number of dimensions
 *   allPtr         size of each dimension
 *   address        out: receives the linear address
 *   ind            0-based index within each dimension
 *
 * The address is accumulated as
 *   indx = indx * allPtr[i] + ind[i]
 * with the last dimension varying fastest.  The listing used "+=" here,
 * which adds the running value a second time and so yields addresses that
 * do not round-trip through the modulo/divide decomposition.
 *
 * Always returns 1.
 */
int IndexesToAddress(int numVariations, int *allPtr, int *address, int *ind)
{
    int i ;
    int indx = 0 ;

    for ( i = 0 ; i < numVariations ; i++ )
        indx = indx * allPtr[i] + ind[i];
    *address = indx;

    return 1 ;
}

/*
 * AddressSize - number of distinct product addresses.
 *
 *   numVariations  number of dimensions
 *   allPtr         size of each dimension
 *   size           out: receives the product of all numVariations sizes
 *                  (1 when numVariations is 0 or negative)
 *
 * An explicit counted loop is used so that every dimension, including the
 * last, is multiplied in and a zero count terminates immediately.
 *
 * Always returns 1.  No overflow check is made on the int product.
 */
int AddressSize(int numVariations, int *allPtr, int *size)
{
    int i;

    *size = 1;
    for ( i = 0 ; i < numVariations ; i++ )
        *size *= allPtr[i];
    return 1;
}

/*
 * not_here - complement a buffer in place, one byte at a time.
 *
 *   what    first byte of the buffer
 *   nbytes  number of bytes to invert; values <= 0 do nothing
 *
 * The read and the pointer advance are separate steps: the single
 * expression "*what++ = ~*what" modifies and reads "what" without
 * sequencing, which is undefined behaviour in C.
 *
 * NOTE(review): the operator is illegible in the listing; bitwise
 * complement (~) is assumed from the function name and its use on a
 * bit set - confirm.
 *
 * Always returns 1.
 */
int not_here( what, nbytes )
unsigned char *what;
int nbytes;
{
    for ( ; nbytes > 0; --nbytes, ++what)
        *what = (unsigned char) ~*what;
    return 1;
}
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Appendix "T" 

©macro FragCTOPS ChSp 
ft 

# Bitiy point for Web-based topomeric search initialization 

§ sets up a set of topomeric searches, by identifying topomer data arising 
from 

10 # substructuial searching of SLN pattems found in topfrag.tbl to the 

# query structure and generating the topomeric data and search command file 
entry 

# for all resulting fragmentations of the query structure. 

15 # The Query SLN(s) are assumed to be referenced by $CS_^QUERY; 

# The file(s) to be searched are referenced by $CS_DATASET (space 
separated) 

# The directory where command files are to be writt^ is $CS_TEMPDIR 

# The GUI parameters are to be in $CS_PARAMETERS 
20 # The name of the output file(s) is to be in SCS^OUTPUT 

# read in the data 
globalvar CTOP 
globalvar ACDITopInited 

localvar fcmdn fcmd tdn dist t base mf mfo nln nxid ferr ferm rxids doit 
25 # checlTthe input parameters 

setvar ferm %cat( SCSJTEMPDIR "/CSenor.log" ) 

setvar ferr %open( Sferm "w* ) 

setvar flogn %cat( $CS__TEMPDIR Vtopfrag.log" ) 

setvar fiog %open( Sflogn "w" ) 
30 setvar fcmdn %cat( $CS_TEMPDIR VCSCommands.cmd* ) 

setvar fcmd %open( $fcmdn "w" ) 

if %not( $fcmd ) 

%write( Sferr could not open temp file Sfcmdn to write ChemSpace search 
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cmds. Quitting ) >$nulidev 
return 
endif 

for tdn in SCS^QUERY 
5 if %pos( $tdn ) 

setvar nogood TRUE 
-if %pos( " <-• $tsln ) 

if %gt( %pos( V $lsln ) %pos( Stsln ) ) 
setvar nogood 
10 endif 
endif 

if Snogood 

%write( $ferr Topomeric searches require a monomolecular search target. 
Quitting ) > Snulldev 
15 goto error 

endif 
endif 

%write( $flog QUERY: $tsln >$nulldev 

setvar dist %CSj)aramj)arse( distance SCS^PARAMETERS 91.0 ) 
20 if %not( $dist ) 

%write( Sferr No topomeric distance provided. Quitting ) >$nulldev 

goto error 
endif 

setvar priority %CSj)araraj)arse( priority $CS_PARAMETERS 3.0 ) 
25 if %not( $priority ) 

%write( Sferr No reaction priority provided. Quitting ) >$nulidev 
goto error 
endif 

%write( Sflog Fragment Priority: Spriority ) >$nulldev 
30 setvar CTOP[ ONLYl ] %CS_paramj»arse( only_subs $CS_PARAMErERS ) 
if SCrOPI ONLY ] 

%write( Sflog Matching Side Chain Only ) >$nulldev 
endif 
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setvar CTOP[ WHGHTS ] %CSj>arainj>arse( xwdghts $CS_PARAMETERS ) 
if $CTOP[ WEIGHTS J 

%write( Sflog User Spedfied Weighting as: $CTOP( WmGHTS ] ) >$nulldev 
for w in $CTOP[ WOGHTS ] 
S setvar pats %search2d( $tsln %aig( 1 %set_unpack( $w ) ) NoDup 0 y ) 

if %not( Spats ) 

%write( $f€XT Wdghted seaixdi for fragment %arg( 1 %set_unpack( $w ) ) 

not 

in $tsln - can't happen! ) >Snundev 
10 goto error 

else 

if %gt( %count( Spats ) 1 ) 

%write( Sflog NOTE: Multiple hits for weighting fragment %arg( 1 
%set_unpack( $w ) ) in Stsln ) >$nuUdev 
15 endif 

endif 
endfor 
endif 

setvar CTOP[ CHBD ] %CSj>aramj>arse( hbonding SCS_PARAMETERS ) 
20 if $CTOP( CHBD 1 . 

%write( Sflog FIELDS include Hydrogai Bonding with weight of SCTOP[ CHBD ] 

) 

>$nulldev 
endif 

25 zap ml >$nulldev 

%sln_to_mol( ml Sisln ) >$nulldev 
if %molempty( ml ) 

%write( Sferr SYBYL cannot handle search target (SLN is: Stsln ). 
Quitting ) >$nulldev 
30 goto error 

endif 

setvar t %moljnfo( ml NATOMS ) 
FILLVALENCEMin H 1.0 L5 1.0 1.5 >$nuUdev 
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if $CTOP[ ONLYl ] 

if %neq( %mol_info( ml NATOMS ) %inath( St + 1 ) ) 
%write( $ferr Side chain search but target Stsln has other than one 
unfilled valence ) > SnuUdev 
S goto error 

endif 

else 

if %neq( %molJnfo( ml NATOMS ) $t ) 
%write( Sferr Search Target $tsln has unfUled valences. Quitting ) 
10 >$nulldev 

goto error 
endif 
endif 

if $CTOP[ ONLYl ] 
IS # only one side chain to modd is a special case 
CTOPISideChainOnly Sfcmd Sferr Sflog $dist 
else 

# dieck for custom topomer fragmentation table or selection 
setvar tftabn 
20 setvar tfrows 

if $CS_TOPFRAG 

setvar t %pos( V $CS_TOPFRAG ) 
if %not( $t ) 

%write( Sferr Custom table name SCS^TOPFRAG missing an ) >$nulldcv 
25 goto error 

else 

setvar tftabn %substr( SCS^TOPFRAG 1 %math( $t - 1 ) ) 
setvar tfrows %substr( SCSJTOPFRAG %math( St + 1 ) ) 
endif 
30 endif 

if %set_and{ "%set_create( %table_.name() )" TOPFRAG ) 

table close TOPFRAG 
endif 
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if %not( Stftabn ) 

setvar tftabn %cat( $DSERV_TB topfrag.tbl ) 
endif 

table recaU Stftabn >$nuUdev 

if %not{ %set_and( ■96set_creale( %lable_naineO )" TOPFRAG ) ) 

%write( $ferr Stftabn not found. Quitting ) >$nulidev 

gc^o error 
endif 

%write( Sflog TopomCT fragmentation table is %cat( SDSERV TB topftag.tbl 
) ) >$nulldev 

# initialize random file name sequence generator 

setvar t %timeO 

setvar base %rand( %substr( "$t" %math( %strlen( "St" ) - 6 ) 2 ) ) 
TAILOR SET MAXIMIN2 MAXIMUM JTERATIONS 1000 | | 
%write( Sflog Master file(s): SCS^DATASET ) >Snulldev 
%write( Sflog TOPFRAG table: Stftabn - Row selection: Stfrows ) 
>$nulldev 

if %not( Stfrows ) 

setvar tfrows %set_create( %range( 1 %table_attribute( NROWS ) ) ) 
aidif 

for rxid in %set_unpack( Stftows ) 

# processing ... 

%wrile( Sflog - ) >$nulldev 

# cheek priority 

TABLE Default TOPFRAG 

if %gt( %rcell( Srxid PRIORITY ) Spriority ) 

%write( Sflog TOPFRAG entry Srxid priority > Spriority. ) >$nulldev 
break 
endif 

setvar CTOP[RxnCount][Srxid] 0 

if %CS_ReactantMatch( Srxid Sfcmd Sferr Stsln Sflog ) 

■f 

%write( Sflog > > > Topomer search queueing (TOPFRAG row Srxid) ) 

>Snulldev 
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CS!Queue_Search $fcmd $rxid $disi $nog 

endif 

endfor 
endif 
endfor 

# may need to purge or rename error file here! 

%close( $fcmd ) 
%close( Sferr ) 
%close( Sflog ) 
return 
error: 

%cIose( Sfcmd ) 

# ensure nothing in search command file ! 

5&file_^delete( $fcmdn ) >$nuUdev 
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CLAIMS 

What is claimed is: 

1. A computer-based method for selecting, for all possible product molecules which could
be created in a combinatorial synthesis from specified reactant molecules and common core
molecule, a subset of product molecules, comprising the following steps:

a. Characterizing all the reactant molecules with a validated molecular structural 
descriptor appropriate to reactant molecules; 

b. Hierarchically clustering the characterized reactant molecules until the intercluster 
distance corresponds to the neighborhood distance of the validated molecular 
structural descriptor or to a value close to the neighborhood distance which creates 
a logical clustering break; 

c. Selecting a reactant molecule from each cluster; 

d. Combinatorially assembling the selected reactant molecules and core molecule into 
products which would be created in the chemical synthesis; 

e. Selecting a product molecule for inclusion in the subset; 

f. Using a validated molecular structural descriptor appropriate to whole molecules, 
calculating the descriptor distance between all selected product molecules and all 
other product molecules; 

g. Determining the shortest distance between each product molecule and all product 
molecules previously selected; 

h. Selecting for inclusion in the subset the product molecule whose shortest descriptor 
distance from the previously selected molecules is the largest and is greater than the 
neighborhood distance of the descriptor; 

i. Repeat steps f through h until the largest shortest difference between molecules is less

than the neighborhood distance of the descriptor; and 
j. Outputing a list of the selected product molecules and/or the reactant molecules from 

which the selected product molecules can be formed. 

2. The method of claim 1 in which the validated molecular structural descriptor appropriate
to reactant molecules is topomeric CoMFA fields. 

3. The method of claim 2 in which topomeric hydrogen bond fields are used in conjunction 
with the topomeric CoMFA fields descriptor. 

4. The method of claim 2 in which the validated molecular structural descriptor appropriate 
to whole molecules is the Tanimoto 2D coefficient. 
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5. The method of claim 4 in which before step g, reactant m lecules with the following 
characteristics are removed from further use in the method: 

a. toxic reactant molecules; 

b. reactant molecules containing metals, improper forms of tautomers, and interfering
chemical groups;

c. reactant molecules with too low a bioavailability; 

d. reactant molecules not likely to cross membranes; and

e. reactant molecules containing biologically non-relevant groups. 

6. The method of claim S in which before stq) product molecules with the following 
characteristics are removed from further use in the method: 

a. product molecules having MW ^ 750; and 

b. product molecules not having a CLOGP between -2 and 7.5.

7. The method of claim 1 in which the validated molecular structural descriptor appropriate 
to whole molecules is the Tanimoto 2D coefficient.

8. The method of claim 7 in which before step a, reactant molecules with the following 
characteristics are removed from further use in the method: 

a. toxic reactant molecules; 

b. reactant molecules containing metals, improper forms of tautomers, and interfering 
chemical groups; 

c. reactant molecules with too low a bioavailability; 

d. reactant molecules not likely to cross membranes; and 

e. reactant molecules containing biologically non-relevant groups. 

9. The method of claim 8 in which before step e, product molecules with the following 
characteristics are removed from further use in the method: 

a, product molecules having MW ^ 750; and 

b. product molecules not having a CLOGP between -2 and 7.5. 

10. A computer-based method for selecting, for all possible product molecules which 
could be created in a combinatorial synthesis from specified reactant molecules, a subset of 
product molecules, comprising the following steps: 

a. Characterizing all the reactant molecules with a validated molecular structural
descriptor appropriate to reactant molecules; 

b. Hierarchically clustering the characterized reactant molecules until the intercluster 
distance corresponds to the neighborhood distance of the validated molecular 
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structural descriptor r to a value close to the ndghborhood distance which creates 
a logical clustering break; 

c. Selecting a reactant molecule from each cluster; 

d. Combinatorially assembling the selected reactant molecules and core molecule into
products which would be created in the chemical synthesis; 

e. Selecting a product molecule for inclusion in the subset; 

f. Using a validated molecular structural descriptor appropriate to whole molecules,
calculating the descriptor distance between all selected product molecules and all 
other product molecules; 

g. Determining the shortest distance between each product molecule and all product 
molecules previously selected; 

h. Selecting for inclusion in the subset the product molecule whose shortest descriptor 
distance from the previously selected molecules is the largest and is greater than the 

neighborhood distance of the descriptor;

i. Repeat steps f through h until the largest shortest difference between molecules is less 
than the neighborhood distance of the descriptor; and 

j. Outputing a list of the selected product molecules and/or the reactant molecules from 
which the selected product molecules can be formed. 
11. The method of claim 10 in which the validated molecular structural descriptor
appropriate to reactant molecules is topomeric CoMFA fields. 

12. The method of claim 11 in which topomeric hydrogen bond fields are used in
conjunction with the topomeric CoMFA fields descriptor. 

13. The method of claim 11 in which the validated molecular structural descriptor
appropriate to whole molecules is the Tanimoto 2D coefficient. 

14. The method of claim 13 in which before step a, reactant molecules with the following 
characteristics are removed from further use in the method: 

a. toxic reactant molecules; 

b. reactant molecules containing metals, improper forms of tautomers, and interfering 
chemical groups; 

c. reactant molecules with too low a bioavailability; 

d. reactant molecules not likely to cross membranes; and 

e. reactant molecules containing biologically non-relevant groups. 

15. The method of claim 14 in which before step g, product molecules with the following 
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characteristics are removed from further use in the method: 

a. product molecules having MW > 750; and 

b. product molecules not having a CLOGP between -2 and 7.5. 

16. The method of claim 10 in which the validated molecular structural descriptor 
appropriate to whole molecules is the Tanimoto 2D coefficient. 

17. The method of claim 16 in which before step a, reactant molecules with the following 
characteristics are removed from further use in the method: 

a. toxic reactant molecules; 

b. reactant molecules containing metals, improper forms of tautomers, and interfering 
chemical groups; 

c. reactant molecules with too low a bioavailability; 

d. reactant molecules not likely to cross membranes; and 

e. reactant molecules containing biologically non-relevant groups. 

18. The method of claim 17 in which before step g, product molecules with the following 
characteristics are removed from further use in the method: 

a. product molecules having MW > 750; and 

b. product molecules not having a CLOGP between -2 and 7.5. 

19. A system for selecting, for all possible product molecules which can be created in a 
combinatorial synthesis from all specified reactant molecules and common core molecule, a 
subset of product molecules whose members collectively represent most of the molecular 
structural diversity in the possible combinatorially synthesized product molecules, comprising: 

a. Means for characterizing all the reactant molecules with a validated molecular 
structural descriptor appropriate to reactant molecules; 

b. Means for hierarchically clustering the characterized reactant molecules until the 
intercluster distance corresponds to the neighborhood distance of the validated 
molecular structural descriptor or to a value close to the neighborhood distance which 
creates a logical clustering break; 

c. Means for selecting one reactant molecule from each cluster; 

d. Means for combinatorially assembling the selected reactant molecules and core 
molecule into products which would be created in the chemical synthesis; 

e. Means for selecting at least one product molecule for inclusion in the subset; 

f. Means for using a validated molecular structural descriptor applicable to whole 
molecules for calculating the descriptor distance between all selected product 
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molecules and all other product molecules; 

g. Means for determining the shortest distance between each product molecule and all 
product molecules previously selected; 

h. Means for selecting for inclusion in the subset the product molecule whose shortest 
descriptor distance from the previously selected molecules is the largest and is greater 
than the neighborhood distance of the descriptor; 

i. Means for invoking means f through h until the largest shortest difference between 
molecules is less than the neighborhood distance of the descriptor; and 

j. Means for outputting a list of the selected product molecules and/or the reactant 
molecules from which the selected product molecules can be formed. 

20. The system of claim 19 in which the reactant appropriate molecular structural 
descriptor is topomeric CoMFA fields. 

21. The system of claim 20 in which topomeric hydrogen bond fields are used in 
conjunction with the topomeric CoMFA fields descriptor. 

22. The system of claim 20 in which the whole molecule appropriate molecular structural 
descriptor is the Tanimoto 2D coefficient. 

23. A system for selecting, for all possible product molecules which can be created in a 
combinatorial synthesis from all specified reactant molecules, a subset of product molecules 
whose members collectively represent most of the molecular structural diversity in the possible 
combinatorially synthesized product molecules, comprising: 

a. Means for characterizing all the reactant molecules with a validated molecular 
structural descriptor appropriate to reactant molecules; 

b. Means for hierarchically clustering the characterized reactant molecules until the 
intercluster distance corresponds to the neighborhood distance of the validated 
molecular structural descriptor or to a value close to the neighborhood distance which 
creates a logical clustering break; 

c. Means for selecting one reactant molecule from each cluster; 

d. Means for combinatorially assembling the selected reactant molecules into products 
which would be created in the chemical synthesis; 

e. Means for selecting at least one product molecule for inclusion in the subset; 

f. Means for using a validated molecular structural descriptor applicable to whole 
molecules for calculating the descriptor distance between all selected product 
molecules and all other product molecules; 
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g. Means f r determining the shortest distance between each product molecule and all 
product molecules previously selected; 

h. Means for selecting for inclusion in the subset the product molecule whose shortest 
descriptor distance from the previously selected molecules is the largest and is greater 
than the neighborhood distance of the descriptor; 

i. Means for invoking means f through h until the largest shortest difference between 
molecules is less than the neighborhood distance of the descriptor; and 

j. Means for outputing a list of the selected product molecules and/or the reactant 
molecules from which the selected product molecules can be formed. 
24. The system of claim 23 in which the reactant appropriate molecular structural 
descriptor is topomeric CoMFA fields. 

25. The system of claim 24 in which topomeric hydrogen bond fields are used in 
conjunction with the topomeric CoMFA fields descriptor. 

26. The system of claim 24 in which the whole molecule appropriate molecular structural 
descriptor is the Tanimoto 2D coefficient. 

27. A combinatorial screening library designed by a computer-based method, which 
selects the screening library molecules from those molecules which could be created in a 
combinatorial synthesis from specified reactant molecules and common core molecule, 
comprising the following steps: 

a. Characterizing all the reactant molecules with a validated molecular structural 
descriptor appropriate to reactant molecules; 

b. Hierarchically clustering the characterized reactant molecules until the intercluster 
distance corresponds to the neighborhood distance of the validated molecular 
structural descriptor or to a value close to the neighborhood distance which creates 
a logical clustering break; 

c. Selecting a reactant molecule from each cluster; 

d. Combinatorially assembling the selected reactant molecules and core molecule into 
products which would be created in the chemical synthesis; 

e. Selecting a product molecule for inclusion in the subset; 

f. Using a validated molecular structural descriptor appropriate to whole molecules, 
calculating the descriptor distance between all selected product molecules and all 
other product molecules; 
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g- Detennining the shortest distance between each product molecule and all product 
molecules previously selected; 

h. Selecting for inclusion in the subset the product molecule whose shortest descriptor 
distance from the previously selected molecules is the largest and is greater than the 
neighborhood distance of the descriptor; 

i. Repeat steps f through h until the largest shortest difference between molecules is less 
than the neighborhood distance of the descriptor; and 

j. Outputting a list of the selected product molecules and/or the reactant molecules from 
which the selected product molecules can be formed. 

28. The method of claim 27 in which the validated molecular structural descriptor 
appropriate to reactant molecules is topomeric CoMFA fields. 

29. The method of claim 28 in which topomeric hydrogen bond fields are used in 
conjunction with the topomeric CoMFA fields descriptor. 

30. The method of claim 28 in which the validated molecular structural descriptor 
appropriate to whole molecules is the Tanimoto 2D coefficient. 

31. A combinatorial screening library designed by a computer-based method, which 
selects the screening library molecules from those molecules which could be created in a 
combinatorial synthesis from specified reactant molecules, comprising the following steps: 

a. Characterizing all the reactant molecules with a validated molecular structural 
descriptor appropriate to reactant molecules; 

b. Hierarchically clustering the characterized reactant molecules until the intercluster 
distance corresponds to the neighborhood distance of the validated molecular 
structural descriptor or to a value close to the neighborhood distance which creates 
a logical clustering break; 

c. Selecting a reactant molecule from each cluster; 

d. Combinatorially assembling the selected reactant molecules and core molecule into 
products which would be created in the chemical synthesis; 

e. Selecting a product molecule for inclusion in the subset; 

f. Using a validated molecular structural descriptor appropriate to whole molecules, 
calculating the descriptor distance between all selected product molecules and all 
other product molecules; 

g. Determining the shortest distance between each product molecule and all product 
molecules previously selected; 
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Selecting for inclusion in the subset the product molecule whose shortest descriptor 
distance from the previously selected molecules is the largest and is greater than the 
neighborhood distance of the descriptor; 
i. Repeat steps f through h until the largest shortest difference between molecules is less 
than the neighborhood distance of the descriptor; and 

j. Outputting a list of the selected product molecules and/or the reactant molecules from 
which the selected product molecules can be formed. 
32. The method of claim 31 in which the validated molecular structural descriptor 
appropriate to reactant molecules is topomeric CoMFA fields. 
33. The method of claim 32 in which topomeric hydrogen bond fields are used in 
conjunction with the topomeric CoMFA fields descriptor. 

34. The method of claim 32 in which the validated molecular structural descriptor 
appropriate to whole molecules is the Tanimoto 2D coefficient. 

35. A computer-based method for characterizing the relative validity or usefulness of 
molecular structural descriptors using multiple literature data sets containing a variety of 
chemical structures and associated activities comprising the following steps: 

a. Applying the molecular structural descriptors to all compounds represented in each 
data set to derive descriptor values; 

b. Constructing a Patterson plot for each molecular structural descriptor for each data 
set using the descriptor values for the compounds in each data set and their associated 
activities; 

c. Determining the appropriate Patterson plot line and the corresponding density ratio 
for each molecular structural descriptor for each data set; 

d. Determining the number of data sets for each molecular structural descriptor for 
which the Patterson plots have a density ratio greater than a predetermined cut-off 
value; and 

e. Creating a ranking ratio for each molecular structural descriptor in which the 
numerator is the number determined in step d and the denominator is the number of 
data sets, said ranking ratio for each molecular structural descriptor being 
representative of the relative validity or usefulness of each molecular structural 
descriptor wherein higher values of the ranking ratio represent a higher degree of 
validity/usefulness. 
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36. The method of claim 35 in which in step d the pred^ermined cut-off is about 1.1. 

37. A computer-based method of merging with a base assembly of molecules one or more 
additional assemblies of molecules, similar molecules in the assemblies having previously been 
identified and removed using a validated molecular structural descriptor, comprising the steps 
of: 

a. Using a validated molecular structural descriptor which is appropriate to whole 
molecules, characterizing all the molecules in the base assembly of molecules and in 
the assembly of molecules to be merged; 

b. Calculating the molecular structural distance between every molecule in the base 
assembly to every molecule in the assembly to be merged; 

c. While there are still molecules in the assembly to be merged which have not been 
tested, selecting a molecule from the assembly to be merged; 

d. Determining whether the molecular structural distance between the selected molecule 
and every molecule in the base assembly is within the neighborhood distance of the 
molecular structural descriptor; 

e. Select for inclusion in the merged assemblies only those molecules identified in step 
d as having molecular structural distances greater than the neighborhood distance. 

f. Repeat step c through step e until all molecules in the assembly to be merged have 
been tested; and 

g. Repeat step a through step f for each additional assembly to be merged. 

38. The method of claim 37 in which the molecular structural descriptor appropriate to 
whole molecules is the Tanimoto similarity coefficient. 

39. A computer-based method of merging with a base assembly of molecules one or more 
additional assemblies of molecules, similar molecules in one or more of the assemblies having 
not previously been identified and removed using a validated molecular structural descriptor, 
comprising the steps of: 

a. Selecting subsets of each assembly by: 

(1) Selecting a molecule within each assembly; 

(2) Using a validated molecular structural descriptor appropriate to whole 
molecules, calculating the descriptor distance between the selected molecule and 
all molecules within the assembly; 



(3) Determining the shortest distance between the selected molecule and all 
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molecules previously selected for the subset; 
(4) Selecting for inclusion in the subset the molecule whose shortest descriptor 
distance from the previously selected molecules is the largest and is greater 
than the neighborhood distance of the descriptor; 
(5) Repeat steps (2) through (4) until the largest shortest difference between 
molecules is less than the neighborhood distance of the descriptor; and 
(6) Repeat steps (1) through (5) for each assembly; 

b. Using a validated molecular structural descriptor which is appropriate to whole 
molecules, characterizing all the molecules in the base assembly of molecules and in 
the assembly of molecules to be merged; 

c. Calculating the molecular structural distance between every molecule in the base 
assembly to every molecule in the assembly to be merged; 

d. While there are still molecules in the assembly to be merged which have not been 
tested, selecting a molecule from the assembly to be merged; 

e. Determining whether the molecular structural distance between the selected molecule 
and every molecule in the base assembly is within the neighborhood distance of the 
molecular structural descriptor; 
f. Select for inclusion in the merged assemblies only those molecules identified in step 
e as having molecular structural distances greater than the neighborhood distance. 
g. Repeat step d through step f until all molecules in the assembly to be merged have 
been tested; and 

h. Repeat step b through step g for each additional assembly to be merged. 
40. The use of a subset of molecules, which could be made in a combinatorial synthesis 
of specified reactants and core, to specify the compounds to be synthesized and tested in 
biological screening assays, said subset being selected by the following computer-based 
method: 

a. Characterizing all the reactant molecules with a validated molecular structural 
descriptor appropriate to reactant molecules; 

b. Hierarchically clustering the characterized reactant molecules until the intercluster 
distance corresponds to the neighborhood distance of the validated molecular 
structural descriptor or to a value close to the neighborhood distance which creates 
a logical clustering break; 

c. Selecting a reactant molecule from each cluster; 
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d. Combinatorially assembling the selected reactant molecules and core molecule into 
products which would be created in the chemical synthesis; 

e. Selecting a product molecule for inclusion in the subset; 

f. Using a validated molecular structural descriptor appropriate to whole molecules, 
calculating the descriptor distance between all selected product molecules and all 
other product molecules; 

g. Determining the shortest distance between each product molecule and all product 
molecules previously selected; 

h. Selecting for inclusion in the subset the product molecule whose shortest descriptor 
distance from the previously selected molecules is the largest and is greater than the 
neighborhood distance of the descriptor; 

i. Repeat steps f through h until the largest shortest difference between molecules is less 
than the neighborhood distance of the descriptor; and 

j. Outputting a list of the selected product molecules and/or the reactant molecules from 
which the selected product molecules can be formed. 

41. The method of claim 40 in which the validated molecular structural descriptor 
appropriate to reactant molecules is topomeric CoMFA fields. 

42. The method of claim 41 in which topomeric hydrogen bond fields are used in 
conjunction with the topomeric CoMFA fields descriptor. 

43. The method of claim 41 in which the validated molecular structural descriptor 
appropriate to whole molecules is the Tanimoto 2D coefficient. 

44. The molecules selected, from those which could be made in a combinatorial synthesis 
of specified reactants and core, by the following computer-based method: 

a. Characterizing all the reactant molecules with a validated molecular structural 
descriptor appropriate to reactant molecules; 

b. Hierarchically clustering the characterized reactant molecules until the intercluster 
distance corresponds to the neighborhood distance of the validated molecular 
structural descriptor or to a value close to the neighborhood distance which creates 
a logical clustering break; 

c. Selecting a reactant molecule from each cluster; 

d. Combinatorially assembling the selected reactant molecules and core molecule into 
products which would be created in the chemical synthesis; 

e. Selecting a product molecule for inclusion in the subset; 
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f. Using a validated molecular structural descriptor a^ropriate to whole molecules, 
calculating the descriptor distance between all selected product molecules and all 
other product molecules; 

g. Determining the shortest distance between each product molecule and all product 
molecules previously selected; 

h. Selecting for inclusion in the subset the product molecule whose shortest descriptor 
distance from the previously selected molecules is the largest and is greater than the 
neighborhood distance of the descriptor; 

i. Repeat steps f through h until the largest shortest difference between molecules is less 
than the neighborhood distance of the descriptor; and 

j. Outputting a list of the selected product molecules and/or the reactant molecules from 
which the selected product molecules can be formed. 
45. The method of claim 44 in which the validated molecular structural descriptor 
appropriate to reactant molecules is topomeric CoMFA fields. 
46. The method of claim 45 in which topomeric hydrogen bond fields are used in 
conjunction with the topomeric CoMFA fields descriptor. 

47. The method of claim 45 in which the validated molecular structural descriptor 
appropriate to whole molecules is the Tanimoto 2D coefficient. 

48. A computer-based method of determining the neighborhood distance characteristic of 
a validated molecular structural descriptor using multiple literature data sets containing a 
variety of chemical structures and associated activities, comprising the following steps: 

a. Applying the molecular structural descriptor to all compounds represented in each 
data set to derive descriptor values; 

b. Constructing a Patterson plot for each molecular structural descriptor for each data 
set using the descriptor values for the compounds in each data set and their associated 
activities; 

c. Determining the appropriate Patterson plot line for each data set; 

d. Using for each data set a point on the Y axis of the corresponding Patterson plot the 
end point of an activity difference for which a neighborhood distance is desired, 
determining the X axis values of the molecular structural descriptor corresponding to 
the projection from the Patterson plot line of the end points of the activity difference; 

e. Determining the average range of values for the neighborhood distance from the plots 
for each of the data sets. 
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49. A method of determining the molecules within any set which are most likdy to have 
the same activity as a lead molecule previously identified in an assay comprising the following 
steps: 

a. Characterizing the lead molecule and all other compounds to be examined using a 
validated molecular structural descriptor appropriate to whole molecules; 

b. Determining the molecular structural descriptor distances between the lead molecule 
and all the other molecules; and 

c. Identifying the molecules whose distances from the lead molecule fall within the 
neighborhood distance of the lead. 

50. The method of claim 49 further comprising the additional steps of: 

d. Determining the molecular structural descriptor distances between the set of 
molecules previously identified and all the other molecules excluding the lead and the 
sets; 

e. Identifying the molecules whose distances from molecules in the previously selected 
set fall within the neighborhood distance; and 

f. Repeating steps d through e as many times as desired. 

51. A method of determining the useful boundaries of exploration within any set of 
molecular structures for molecules possessing the same activity as a lead molecule previously 
identified in an assay comprising the following steps: 

a. Characterizing the lead molecule and all other compounds to be examined using a 
validated molecular structural descriptor appropriate to whole molecules; 

b. Determining the molecular structural descriptor distances between the lead molecule 
and all the other molecules; and 

c. Identifying the molecules whose distances from the lead molecule fall within the 
neighborhood distance of the lead; 

d. Synthesizing and testing in an assay the molecules identified in step c and if no 
activity is detected, stop. 

e. If activity is detected, calculating molecular structural descriptor distances, from each 
molecule identified in the previous step as showing activity, to all other compounds 
(excluding the lead compound and each previously identified active compound); 

f. Identifying all molecules within the neighborhood diameter of the previously 
identified active molecules; 

g. Synthesizing and testing in an assay the molecules identified in the previous step, and 
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if no activity is detected, stop; and 
h. Repeating steps e through g until no further compounds show activity in the assay. 

52. A computer-based method of characterizing the three dimensional structure of 
reactants, which can assume many conformations, comprising the steps of: 

a. Topomerically aligning the reactants; and 

b. Determining the CoMFA steric fields for each topomerically aligned reactant. 

53. The method of claim 52 further comprising the addition of topomeric hydrogen 
bonding fields to the CoMFA steric fields. 

54. A computer-based method of applying a molecular structural descriptor to a set of 
reactants comprising the following steps: 

a. Topomerically aligning the reactants; 

b. Determining the CoMFA steric fields for each topomerically aligned reactant; and 

c. Calculating the field differences between all pairs of reactants. 

55. The method of claim 54 further comprising after step b the additional step of adding 
topomeric hydrogen bonding fields to the CoMFA fields. 

56. The method of claim 54 further comprising after step c the additional step of 
hierarchically clustering the reactants until the intercluster distance is about 80-100 CoMFA 
field units. 

57. In a digital computer in which representations of specified reactant molecules and a 
core molecule have been stored, a computer-based method for selecting, for all possible 
product molecules which could be created in a combinatorial synthesis from the reactant 
molecules and common core molecule, a subset of product molecules, comprising the following 
steps: 

a. Characterizing all the reactant molecules with a validated molecular structural 
descriptor appropriate to reactant molecules; 

b. Hierarchically clustering the characterized reactant molecules until the intercluster 
distance corresponds to the neighborhood distance of the validated molecular 
structural descriptor or to a value close to the neighborhood distance which creates 
a logical clustering break; 

c. Selecting a reactant molecule from each cluster; 

d. Combinatorially assembling the selected reactant molecules and core molecule into 
products which would be created in the chemical synthesis; 

e. Selecting a product molecule for inclusion in the subset; 
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f. Using a validated molecular structural descriptor appropriate to whole molecules, 
calculating the descriptor distance between all selected product molecules and all 
other product molecules; 

g. Determining the shortest distance between each product molecule and all product 
molecules previously selected; 

h. Selecting for inclusion in the subset the product molecule whose shortest descriptor 
distance from the previously selected molecules is the largest and is greater than the 
neighborhood distance of the descriptor; 

i. Repeat steps f through h until the largest shortest difference between molecules is less 
than the neighborhood distance of the descriptor; and 

j. Outputting a list of the selected product molecules and/or the reactant molecules from 
which the selected product molecules can be formed. 

58. The method of claim 57 in which the validated molecular structural descriptor 
appropriate to reactant molecules is topomeric CoMFA fields. 

59. The method of claim 58 in which topomeric hydrogen bond fields are used in 
conjunction with the topomeric CoMFA fields descriptor. 

60. The method of claim 57 in which the validated molecular structural descriptor 
appropriate to whole molecules is the Tanimoto 2D coefficient. 

61. A computer-based method for generating a virtual library of possible combinatorially 
derived product molecules which can be searched for product molecules having desired 
properties without the necessity of generating the product structure during the search, 
comprising the following steps: 

a. Creating one or more files identifying one or more combinatorial reactions for one or 
more core structures; 

b. Creating separate structural variation files (associated with the reaction identifying files) 
in which are listed together the structural variations representative of those reactants 
which will react at each variation site of each combinatorial reaction; 

c. Associating with each structural variation, data, characterizing each structural variation 
including: 

(1) Characterization data, taking into account when necessary the structures of the 
cores with which the structural variations would be combined in the listed 
combinatorial syntheses, which has not been derived from the application of 
validated molecular structural descriptors; and 
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(2) Characterizing data, taking into account when necessary the structures of the cores 
with which the structural variations would be combined in the listed combinatorial 
syntheses, which has been derived from applying validated molecular structural 
descriptors to the structural variations. 
62. A virtual library of possible combinatorially derived product molecules which can be 
searched for product molecules having desired properties without the necessity of generating 
the product structures during the search, generated by the following process: 

a. Creating one or more files identifying one or more combinatorial reactions for one or 
more core structures; 

b. Creating separate structural variation files (associated with the reaction identifying files) 
in which are listed together the structural variations representative of those reactants 
which will react at each variation site of each combinatorial reaction; 
c. Associating with each structural variation, data, characterizing each structural variation 
including: 

(1) Characterization data, taking into account when necessary the structures of the 
cores with which the structural variations would be combined in the listed 
combinatorial syntheses, which has not been derived from the application of 
validated molecular structural descriptors; and 
(2) Characterizing data, taking into account when necessary the structures of the cores 
with which the structural variations would be combined in the listed combinatorial 
syntheses, which has been derived from applying validated molecular structural 
descriptors to the structural variations. 
63. The method of claim 61 further comprising a computer-based method for selecting 
from the virtual library, for all possible product molecules which could be created by all 
combinatorial arrangements of specified structural variations and a common core molecule, a 
subset of product molecules, comprising the following additional steps: 

b. identifying all possible combinatorial product molecules which could result from the 
specified reactants and selected core molecules; 

c. selecting from all possible combinatorial product molecules a product molecule for 
inclusion in the subset; 

d. using a validated molecular descriptor appropriate to whole molecules with which the 
Virtual Library was generated, removing from the set of all remaining molecules those 
molecules falling within a chosen neighborhood distance of the selected molecule; 
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e. using a validated molecular descriptor appropriate to the stnictural variations with which 
the Virtual Library was generated, removing from the set of all remaining product 
molecules those molecules formed from structural variations falling within a chosen 
neighborhood distance of the structural variations of the selected molecule; 
f. selecting from the set of all product molecules remaining after step e a product molecule 
for inclusion in the subset; 

g. repeating steps d through f until no additional product molecules remain to be selected 
in step f; and 

h. Outputting a list of the selected subset and/or the structural variations from which the 
subset can be formed. 

64. The method of claim 61 further comprising a computer-based method for selecting 
from the virtual library, for all possible product molecules which could be created by all 
combinatorial arrangements of specified structural variations and core molecules, a subset of 
product molecules, comprising the following additional steps: 
15 b. selecting from all possible cores a core upon which to base the subset; 

c. using a validated molecular descriptor appropriate to cores, selecting from the set of all 
possible cores those core molecules falling within the neighborhood distance of the
selected core molecule; 

d. identifying all possible combinatorial product molecules which could result from the 
20 specified structural variations and selected core molecules;

e. selecting from all possible combinatorial product molecules a product molecule for 
inclusion in the subset; 

f. using a validated molecular descriptor appropriate to whole molecules with which the
Virtual Library was generated, removing from the set of all remaining molecules those

25 molecules falling within a chosen neighborhood distance of the selected molecule; 

g. using a validated molecular descriptor appropriate to the structural variations with which 
the Virtual Library was generated, removing from the set of all remaining product 
molecules those molecules formed from structural variations falling within a chosen 
neighborhood distance of the structural variations of the selected molecule;

30 h. selecting from the set of all product molecules remaining after step g a product molecule 
for inclusion in the subset; 

i. repeating steps f through h until no additional product molecules remain to be selected
in step h; and
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j. Outputting a list of the selected subset and/or the structural variations and cores from

which the subset can be formed. 
65. The method of claim 61 further comprising a computer-based method for selecting 
from the virtual library, for all possible product molecules which could be created by all 
5 combinatorial arrangements of specified structural variations and a common core molecule, a
subset of product molecules, comprising the following additional steps: 

b. identifying all possible combinatorial product molecules which could result from the
specified reactants and selected core molecules; 

c. selecting from all possible combinatorial product molecules a product molecule for 
10 inclusion in the subset; 

d. using a validated molecular descriptor appropriate to whole molecules with which the
Virtual Library was generated, removing from the set of all remaining molecules those 
molecules falling within the neighborhood distance of the selected molecule; 

e. selecting from the set of all product molecules remaining after step d a product molecule 
15 for inclusion in the subset;

f. repeating steps d through e until no additional product molecules remain to be selected 
in step f; and 

g. Outputting a list of the selected subset and/or the structural variations from which the
subset can be formed. 

20 66. The method of claim 61 further comprising a computer-based method for selecting 

* 

from the virtual library, for all possible product molecules, which could be created by all 
combinatorial arrangements of specified structural variations and a common core molecule, a 
subset of product molecules, comprising the following additional steps: 

b. identifying all possible combinatorial product molecules which could result from the 
25 specified reactants and selected core molecules; 

c. selecting from all possible combinatorial product molecules a product molecule for 
inclusion in the subset; 

d. using a validated molecular descriptor appropriate to the structural variations with which 
the Virtual Library was generated, removing from the set of all remaining product 

30 molecules those molecules formed from structural variations falling within a chosen 

neighborhood distance of the structural variations of the selected molecule; 

e. selecting from the set of all product molecules remaining after step d a product molecule 
for inclusion in the subset; 



*
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f. repeating steps d through e until no additional product molecules remain to be selected
in step e; and

g. Outputting a list of the selected subset and/or the structural variations from which the
subset can be formed. 

67. A screening library designed by a computer-based method which selects the screening 
library molecules from those molecules which could be created by all combinatorial 

arrangements of specified structural variations and a common core molecule comprising the 
following steps: 

a. generating a virtual library by: 

(1). creating one or more files identifying one or more combinatorial reactions for one

or more core structures; 

(2) . creating separate structural variation files (associated with the reaction identifying 

files) in which are listed together the structural variations representative of those 
reactants which will react at each variation site of each combinatorial reaction; 

(3). associating with each structural variation, data, characterizing each structural

variation including: 

(a). characterization data, taking into account when necessary the structures of the

cores with which the structural variations would be combined in the listed
combinatorial syntheses, which has not been derived from the application of
validated molecular structural descriptors; and 

(b) . characterizing data, taking into account when necessary the structures of the 

cores with which the structural variations would be combined in the listed 
combinatorial syntheses, which has been derived from applying validated 
molecular structural descriptors to the structural variations; 

b. identifying in the virtual library all possible combinatorial product molecules which 
could result from the specified reactants and selected core molecules; 

c. selecting from all possible combinatorial product molecules a product molecule for 
inclusion in the subset; 

d. using a validated molecular descriptor appropriate to whole molecules with which the 
Virtual Library was generated, removing from the set of all remaining molecules those 
molecules falling within a chosen neighborhood distance of the selected molecule;

e. using a validated molecular descriptor appropriate to the structural variations with which 
the Virtual Library was generated, removing from the set of all remaining product
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molecules those molecules formed from structural variations falling within a chosen
neighborhood distance of the structural variations of the selected molecule; 
f. selecting from the set of all product molecules remaining after step e a product molecule
for inclusion in the subset;
5 g. repeating steps d through f until no additional product molecules remain to be selected

in step f ; and 

h. Outputting a list of the selected subset and/or the structural variations from which the
subset can be formed. 

68. A screening library designed by a computer-based method which selects the screening
10 library molecules from those molecules which could be created by all combinatorial
arrangements of specified structural variations and core molecules comprising the following
steps:

a. generating a virtual library by:

(1) . creating one or more files identifying one or more combinatorial reactions for one 
15 or more core structures; 

(2). creating separate structural variation files (associated with the reaction identifying

files) in which are listed together the structural variations representative of those
reactants which will react at each variation site of each combinatorial reaction; 

(3) . associating with each structural variation, data, characterizing each structural 
20 variation including: 

(a), characterization data, taking into account when necessary the structures of the 
cores with which the structural variations would be combined in the listed 
combinatorial syntheses, which has not been derived from the application of 
validated molecular structural descriptors; and 
25 (b). characterizing data, taking into account when necessary the structures of the 

cores with which the structural variations would be combined in the listed 
combinatorial syntheses, which has been derived from applying validated 
molecular structural descriptors to the structural variations; 
b. selecting from all possible cores a core upon which to base the subset;
30 c. using a validated molecular descriptor appropriate to cores, selecting from the set of all
possible cores those core molecules falling within the neighborhood distance of the 
selected core molecule; 
d. identifying all possible combinatorial product molecules which could result from the 
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specified reactants and selected core molecules; 
e. selecting from all possible combinatorial product molecules a product molecule for 

inclusion in the subset; 
f. using a validated molecular descriptor appropriate to whole molecules with which the

Virtual Library was generated, removing from the set of all remaining molecules those

molecules falling within a chosen neighborhood distance of the selected molecule; 

g. using a validated molecular descriptor appropriate to the structural variations with which 
the Virtual Library was generated, removing from the set of all remaining product
molecules those molecules formed from structural variations falling within a chosen
neighborhood distance of the structural variations of the selected molecule; 

h. selecting from the set of all product molecules remaining after step g a product molecule
for inclusion in the subset; 

i. repeating steps f through h until no additional product molecules remain to be selected
in step h; and 

j. Outputting a list of the selected subset and/or the structural variations and cores from
which the subset can be formed.

69. The use of a subset of molecules, which could be made in a combinatorial synthesis 
of specified reactants and common core, to specify the compounds to be synthesized and tested
in appropriate assays, said subset being selected by the following computer-based method: 

a. generating a virtual library by: 

(1) . creating one or more files identifying one or more combinatorial reactions for one 

or more core structures; 

(2). creating separate structural variation files (associated with the reaction identifying

files) in which are listed together the structural variations representative of those
reactants which will react at each variation site of each combinatorial reaction;

(3) . associating with each structural variation, data, characterizing each structural 

variation including: 

(a) , characterization data, taking into account when necessary the structures of the 

cores with which the structural variations would be combined in the listed 
combinatorial syntheses, which has not been derived from the application of 
validated molecular structural descriptors; and 

(b) . characterizing data, taking into account when necessary the structures of the 

cores with which the structural variations would be combined in the listed 
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combinatorial syntheses, which has been derived from applying validated 
molecular structural descriptors to the structural variations;

b. identifying in the virtual library all possible combinatorial product molecules which 
could result from the specified reactants and selected core molecules;

c. selecting from all possible combinatorial product molecules a product molecule for 
inclusion in the subset;

d. using a validated molecular descriptor appropriate to whole molecules with which the
Virtual Library was generated, removing from the set of all remaining molecules those
molecules falling within a chosen neighborhood distance of the selected molecule;

e. using a validated molecular descriptor appropriate to the structural variations with which 
the Virtual Library was generated, removing from the set of all remaining product
molecules those molecules formed from structural variations falling within a chosen
neighborhood distance of the structural variations of the selected molecule;

f . selecting from the set of all product molecules remaining after step e a product molecule 
for inclusion in the subset; 

g. repeating steps d through f until no additional product molecules remain to be selected 
in step f ; and 

h. Outputting a list of the selected subset and/or the reactants from which the subset can
be formed. 

70. The molecules selected, from those which could be made in a combinatorial synthesis
of specified reactants and common core, by the following computer-based method: 
a. generating a virtual library by: 

(1) . creating one or more files identifying one or more combinatorial reactions for one 

or more core structures; 

(2). creating separate structural variation files (associated with the reaction identifying

files) in which are listed together the structural variations representative of those
reactants which will react at each variation site of each combinatorial reaction; 

(3). associating with each structural variation, data, characterizing each structural

variation including: 

(a). characterization data, taking into account when necessary the structures of the
cores with which the structural variations would be combined in the listed
combinatorial syntheses, which has not been derived from the application of
validated molecular structural descriptors; and
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(b). characterizing data, taking into account when necessary the structures of the
cores with which the structural variations would be combined in the listed 
combinatorial syntheses, which has been derived from applying validated 
molecular structural descriptors to the structural variations;

b. identifying in the virtual library all possible combinatorial product molecules which 
could result from the specified reactants and core molecule; 

c. selecting from all possible combinatorial product molecules a product molecule for
inclusion in the subset; 

d. using a validated molecular descriptor appropriate to whole molecules with which the
Virtual Library was generated, removing from the set of all remaining molecules those 
molecules falling within a chosen neighborhood distance of the selected molecule; 

e. using a validated molecular descriptor appropriate to the structural variations with which 
the Virtual Library was generated, removing from the set of all remaining product
molecules those molecules formed from structural variations falling within a chosen
neighborhood distance of the structural variations of the selected molecule;

f . selecting from the set of all product molecules remaining after step e a product molecule 
for inclusion in the subset; 

g. repeating steps d through f until no additional product molecules remain to be selected
in step f ; and

h. Outputting a list of the selected subset and/or the reactants from which the subset can
be formed. 

71. The molecules selected, from those which could be made in a combinatorial synthesis
of specified reactants and cores, by the following computer-based method:
a. generating a virtual library by:

(1) . creating one or more files identifying one or more combinatorial reactions for one 

or more core structures; 

(2). creating separate structural variation files (associated with the reaction identifying

files) in which are listed together the structural variations representative of those 
reactants which will react at each variation site of each combinatorial reaction; 

(3) . associating with each structural variation, data, characterizing each structural 

variation including: 

(a), characterization data, taking into account when necessary the structures of the 
cores with which the structural variations would be combined in the listed 
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combinatorial syntheses, which has not been derived from the application of 
validated molecular structural descriptors; and 
(b). characterizing data, taking into account when necessary the structures of the 
cores with which the structural variations would be combined in the listed
5 combinatorial syntheses, which has been derived from applying validated 

molecular structural descriptors to the structural variations; 
b. selecting from all possible cores a core upon which to base the subset;

c. using a validated molecular descriptor appropriate to cores, selecting from the set of all 
possible cores those core molecules falling within the neighborhood distance of the

10 selected core molecule; 

d. identifying all possible combinatorial product molecules which could result from the 
specified reactants and selected core molecules; 

e. selecting from all possible combinatorial product molecules a product molecule for 
inclusion in the subset; 

15 f. using a validated molecular descriptor appropriate to whole molecules with which the
Virtual Library was generated, removing from the set of all remaining molecules those
molecules falling within a chosen neighborhood distance of the selected molecule; 

g. using a validated molecular descriptor appropriate to the structural variations with which
the Virtual Library was generated, removing from the set of all remaining product 

20 molecules those molecules formed from structural variations falling within a chosen

neighborhood distance of the structural variations of the selected molecule; 

h. selecting from the set of all product molecules remaining after step g a product molecule
for inclusion in the subset; 

i. repeating steps f through h until no additional product molecules remain to be selected 
25 in step h; and

j. Outputting a list of the selected subset and/or the reactants from which the subset can 
be formed. 

72. The method of claim 1 further comprising a computer-based method for selecting 
from the virtual library, for all possible product molecules which could be created by all 
30 combinatorial arrangements of specified structural variations and a common core molecule, a 
subset of product molecules, comprising the following additional steps: 

b. identifying all possible combinatorial product molecules which could result from the
specified reactants and selected core molecules; 
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c. selecting from all possible combinatorial product molecules a product molecule for 
inclusion in the subset; 

d. using a combination validated molecular descriptor characterizing both whole molecule 
and structural variation features with which the Virtual Library was generated, removing

5 from the set of all remaining molecules those molecules falling within a chosen

neighborhood distance of the selected molecule; 
e. selecting from the set of all product molecules remaining after step d a product molecule
for inclusion in the subset;

f. repeating steps d through e until no additional product molecules remain to be selected 
10 in step e; and

h. Outputting a list of the selected subset and/or the structural variations from which the 
subset can be formed. 

73. The method of claim 61 further comprising a computer-based method for selecting 
from the virtual library, for all possible product molecules which could be created by all 
15 combinatorial arrangements of specified structural variations and core molecules, a subset of
product molecules, comprising the following additional steps: 

b. selecting from all possible cores a core upon which to base the subset;

c. using a validated molecular descriptor appropriate to cores, selecting from the set of all 
possible cores those core molecules falling within the neighborhood distance of the 

20 selected core molecule; 

d. identifying all possible combinatorial product molecules which could result from the 
specified structural variations and selected core molecules;

e. selecting from all possible combinatorial product molecules a product molecule for
inclusion in the subset; 

25 f. using a combination validated molecular descriptor characterizing both whole molecule 
and structural variation features with which the Virtual Library was generated, removing 
from the set of all remaining molecules those molecules falling within a chosen 
neighborhood distance of the selected molecule;

g. selecting from the set of all product molecules remaining after step e a product molecule
30 for inclusion in the subset; 

f. repeating steps c through g until no additional product molecules remain to be selected 
in step g; and

h. Outputting a list of the selected subset and/or the structural variations and cores from
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which the subset can be formed. 
74. The molecules selected, from those which could be made in a combinatorial synthesis 
of specified reactants and common core, by the following computer-based method: 

a. generating a virtual library by: 

(1) . creating one or more files identifying one or more combinatorial reactions for one 

or more core structures; 

(2). creating separate structural variation files (associated with the reaction identifying

files) in which are listed together the structural variations representative of those
reactants which will react at each variation site of each combinatorial reaction; 

(3). associating with each structural variation, data, characterizing each structural

variation including: 

(a) , characterization data, taking into account when necessary the structures of the 

cores with which the structural variations would be combined in the listed 
combinatorial syntheses, which has not been derived from the application of 
validated molecular structural descriptors; and 

(b) . characterizing data, taking into account when necessary the structures of the 

cores with which the structural variations would be combined in the listed 
combinatorial syntheses, which has been derived from applying validated
molecular structural descriptors to the structural variations; 

b. identifying in the virtual library all possible combinatorial product molecules which 
could result from the specified reactants and core molecule; 

c. selecting from all possible combinatorial product molecules a product molecule for 
inclusion in the subset; 

d. using a combination validated molecular descriptor characterizing both whole molecule
and structural variation features with which the Virtual Library was generated, removing 
from the set of all remaining molecules those molecules falling within a chosen 
neighborhood distance of the selected molecule;

e. selecting from the set of all product molecules remaining after step d a product molecule
for inclusion in the subset; 

f. repeating steps d through e until no additional product molecules remain to be selected 
in step e; and 

h. Outputting a list of the selected subset and/or the reactants from which the subset can
be formed. 



WO 97/27559 PCT/US97/01491

671 

75. The molecules selected, from those which could be made in a combinatorial synthesis 
of specified reactants and cores, by the following computer-based method: 

a. generating a virtual library by: 

(1) . creating one or more files identifying one or more combinatorial reactions for one 

or more core structures; 

(2). creating separate structural variation files (associated with the reaction identifying

files) in which are listed together the structural variations representative of those 
reactants which will react at each variation site of each combinatorial reaction; 

(3). associating with each structural variation, data, characterizing each structural

variation including: 

(a) , characterization data, taking into account when necessary the structures of the 

cores with which the structural variations would be combined in the listed 
combinatorial syntheses, which has not been derived from the application of 
validated molecular structural descriptors; and

(b) . characterizing data, taking into account when necessary the structures of the 

cores with which the structural variations would be combined in the listed 
combinatorial syntheses, which has been derived from applying validated 
molecular structural descriptors to the structural variations; 

b. selecting from all possible cores a core upon which to base the subset; 

c. using a validated molecular descriptor appropriate to cores, selecting from the set of all
possible cores those core molecules falling within the neighborhood distance of the
selected core molecule; 

d. identifying all possible combinatorial product molecules which could result from the 
specified reactants and selected core molecules; 

e. selecting from all possible combinatorial product molecules a product molecule for 
inclusion in the subset; 

f. using a combination validated molecular descriptor characterizing both whole molecule 
and structural variation features with which the Virtual Library was generated, removing 
from the set of all remaining molecules those molecules falling within a chosen 
neighborhood distance of the selected molecule;

g. selecting from the set of all product molecules remaining after step f a product molecule 
for inclusion in the subset; 

f. repeating steps f through g until no additional product molecules remain to be selected 
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in step g; and 

h. Outputting a list of the selected subset and/or the reactants and cores from which the 
subset can be formed. 

76. The method of claim 61 further comprising a method of determining within the virtual
library, the molecules which could be created by all combinatorial arrangements of specified

structural variations and a common core molecule, which are most likely to have the same type 
of activity as a molecule of interest comprising the following steps: 

a. identifying in the virtual library all possible combinatorial product molecules which
could result from the specified reactants and selected core molecules;
b. characterizing the molecule of interest with a validated molecular structural descriptor
appropriate to whole molecules with which the virtual library was generated; 
d. using the same validated molecular descriptor appropriate to whole molecules, selecting 
the set of all possible molecules whose descriptor values fall within a chosen 
neighborhood distance of the selected molecule; and
g. Outputting a list of the selected subset and/or the structural variations from which the
subset can be formed. 

77. The method of claim 61 further comprising a method of determining within the virtual
library, the molecules which could be created by all combinatorial arrangements of specified
structural variations and a common core molecule, which are most likely to have the same type

of activity as a molecule of interest comprising the following steps:

a. identifying in the virtual library all possible combinatorial product molecules which 
could result from the specified reactants and selected core molecules; 

b. characterizing the molecule of interest with a validated molecular structural descriptor 
appropriate to structural variations with which the virtual library was generated;

d. using the same validated molecular descriptor appropriate to structural variations,
selecting the set of all possible molecules whose descriptor values fall within a chosen
neighborhood distance of the selected molecule; and
g. Outputting a list of the selected subset and/or the structural variations from which the
subset can be formed. 

78. The method of claim 61 further comprising a method of determining within the virtual
library, the molecules which could be created by all combinatorial arrangements of specified 
structural variations and a common core molecule, which are most likely to have the same type 
of activity as a molecule of interest comprising the following steps: 
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a. idoidfying in the virtual library all possible combinatorial product molecules which 
could result from the specified reactants and selected core molecules; 

b. characterizing the molecule of interest with both a validated molecular structural 
descriptor appropriate to structural variations with which the virtual library was

generated and with a validated molecular structural descriptor appropriate to structural

variations with which the virtual library was generated; 

d. using the same validated molecular descriptor appropriate to whole molecules, selecting 
the set of all possible molecules whose descriptor values fall within a chosen 
neighborhood distance of the selected molecule, and using the same validated molecular

descriptor appropriate to structural variations, selecting the set of all possible molecules

whose descriptor values fall within a chosen neighborhood distance of the selected
molecule; and

e. Outputting a list of the selected subset and/or the structural variations from which the
subset can be formed.

79. The method of claim 61 further comprising a method of determining within the virtual
library, the molecules which could be created by all combinatorial arrangements of specified 
structural variations and a common core molecule, which are most likely to have the same type 
of activity as a molecule of interest comprising the following steps: 

a. identifying in the virtual library all possible combinatorial product molecules which 
could result from the specified reactants and selected core molecules;

b. characterizing the molecule of interest with a combination validated molecular
descriptor, characterizing both whole molecule and structural variation features, with
which the Virtual Library was generated;
d. using the same validated molecular descriptor, selecting the set of all possible molecules
whose descriptor values fall within a chosen neighborhood distance of the selected

molecule; and 

g. Outputting a list of the selected subset and/or the structural variations from which the
subset can be formed. 

80. The molecules, which are most likely to have the same type of activity as a molecule 
of interest, selected, from those which could be made in a combinatorial synthesis from
specified reactants and a common core molecule, by the following computer-based method:
a. generating a virtual library by: 

(1). creating one or more files identifying one or more combinatorial reactions for one 
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or more core structures; 
(2). creating separate structural variation files (associated with the reaction identifying
files) in which are listed together the structural variations representative of those
reactants which will react at each variation site of each combinatorial reaction;
(3). associating with each structural variation, data, characterizing each structural

variation including:

(a). characterization data, taking into account when necessary the structures of the

cores with which the structural variations would be combined in the listed
combinatorial syntheses, which has not been derived from the application of
validated molecular structural descriptors; and

(b). characterizing data, taking into account when necessary the structures of the

cores with which the structural variations would be combined in the listed 
combinatorial syntheses, which has been derived from applying validated 
molecular structural descriptors to the structural variations;
b. identifying in the virtual library all possible combinatorial product molecules which
could result from the specified reactants and selected core molecules; 

c. characterizing the molecule of interest with both a validated molecular structural 
descriptor appropriate to structural variations with which the virtual library was
generated and with a validated molecular structural descriptor appropriate to structural

variations with which the virtual library was generated;

d. using the same validated molecular descriptor appropriate to whole molecules, selecting 
the set of all possible molecules whose descriptor values fall within a chosen 
neighborhood distance of the selected molecule, and using the same validated molecular 
descriptor appropriate to structural variations, selecting the set of all possible molecules 

whose descriptor values fall within a chosen neighborhood distance of the selected

molecule; and 

e. Outputting a list of the selected subset and/or the reactants from which the subset can be
formed. 

81. The molecules, which are most likely to have the same type of activity as a molecule 
of interest, selected, from those which could be made in a combinatorial synthesis from
specified reactants and a common core molecule, by the following computer-based method: 
a. generating a virtual library by: 

(1). creating one or more files identifying one or more combinatorial reactions for one 
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or more core structures; 

(2) , creating separate structural variation files (associated with the reaction identifying 

files) in which are listed together the structural variations representative of those 
reactants which will react at each variation site of each combinatorial reaction; 

(3) . associating with each structural variation, data, characterizing each structural 

variation including: 

(a) , characterization data, taking into account when necessary the structures of the 

cores with which the structural variations would be combined in the listed 
combinatorial syntheses, which has not been derived from the application of
validated molecular structural descriptors; and 

(b) . characterizing data, taking into account when necessary the structures of the 

cores with which the structural variations would be combined in the listed 
combinatorial syntheses, which has been derived from applying validated 
molecular structural descriptors to the structural variations; 

b. identifying in the virtual library all possible combinatorial product molecules which 
could result from the specified reactants and selected core molecules; 

c. characterizing the molecule of interest with a combination validated molecular
descriptor, characterizing both whole molecule and structural variation features, with
which the Virtual Library was generated;

d. using the same validated molecular descriptor, selecting the set of all possible molecules 
whose descriptor values fall within a chosen neighborhood distance of the selected 
molecule; and 

e. Outputting a list of the selected subset and/or the reactant from which the subset of
molecules can be formed. 

82. The use of a subset of molecules, which are most likely to have the same type of 
activity as a molecule of interest and selected from those which could be made in a
combinatorial synthesis from specified reactants and a common core molecule, to specify the
compounds to be synthesized and tested in appropriate assays, said subset being selected by
the following computer-based method:

a. generating a virtual library by: 

(1) , creating one or more files identifying one or more combinatorial reactions for one 

or more core structures; 

(2). creating separate structural variation files (associated with the reaction identifying
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files) in which are listed together the structural variations rqiresratative of those 
reactants which will react at each variation site of each combinatorial reaction; 
(3). associating with each structural variation, data, characterizing each structural 
variation including: 

(a). characterization data, taking into account when necessary the structures of the

cores with which the structural variations would be combined in the listed
combinatorial syntheses, which has not been derived from the application of
validated molecular structural descriptors; and
(b). characterizing data, taking into account when necessary the structures of the
cores with which the structural variations would be combined in the listed

combinatorial syntheses, which has been derived from applying validated
molecular structural descriptors to the structural variations;
b. identifying in the virtual library all possible combinatorial product molecules which
could result from the specified reactants and selected core molecules;
c. selecting from all possible combinatorial product molecules a product molecule for
inclusion in the subset; 

d. characterizing the molecule of interest with both a validated molecular structural 
descriptor appropriate to whole molecules with which the virtual library was generated 
and with a validated molecular structural descriptor appropriate to structural variations 

with which the virtual library was generated;

e. using the same validated molecular descriptor appropriate to whole molecules, selecting
the set of all possible molecules whose descriptor values fall within a chosen
neighborhood distance of the selected molecule, and using the same validated molecular
descriptor appropriate to structural variations, selecting the set of all possible molecules

whose descriptor values fall within a chosen neighborhood distance of the selected

molecule; and 

f. Outputting a list of the selected subset and/or the reactants from which the subset can be
formed. 

83. The use of a subset of molecules, which are most likely to have the same type of 
activity as a molecule of interest and selected from those which could be made in a
combinatorial synthesis from specified reactants and a common core molecule, to specify the 
compounds to be synthesized and tested in appropriate assays, said subset being selected by 
the following computer-based method: 
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a. generating a virtual library by: 

(1) . creating one or more files identifying one or more combinatorial reactions for one 

or more core structures; 

(2). creating separate structural variation files (associated with the reaction identifying
files) in which are listed together the structural variations representative of those

reactants which will react at each variation site of each combinatorial reaction; 

(3) . associating with each structural variation, data, characterizing each structural 

variation including: 

(a) , characterization data, taking into account when necessary the structures of the 

cores with which the structural variations would be combined in the listed
combinatorial syntheses, which has not been derived from the application of
validated molecular structural descriptors; and

(b). characterizing data, taking into account when necessary the structures of the

cores with which the structural variations would be combined in the listed
combinatorial syntheses, which has been derived from applying validated
molecular structural descriptors to the structural variations;

b. identifying in the virtual library all possible combinatorial product molecules which
could result from the specified reactants and selected core molecules;

c. selecting from all possible combinatorial product molecules a product molecule for
inclusion in the subset;

d. characterizing the molecule of interest with a combination validated molecular
descriptor, characterizing both whole molecule and structural variation features, with
which the Virtual Library was generated;

e. using the same validated molecular descriptor, selecting the set of all possible molecules
whose descriptor values fall within a chosen neighborhood distance of the selected
molecule; and

f. Outputting a list of the selected subset and/or the reactant from which the subset of
molecules can be formed. 

84. The method of claim 61 further comprising a method of determining within the virtual
library, the molecules which could be created by all combinatorial arrangements of specified
structural variations and core molecules, which are most likely to have the same type of
activity as a molecule of interest, comprising the following steps:

a. selecting from all possible cores a core upon which to base the subset;
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b. using a validated molecular descriptor appropriate to cores, selecting from the set of all 
possible cores those core molecules falling within the neighborhood distance of the 
selected core molecule; 

c. identifying all possible combinatorial product molecules which could result from the 
specified reactants and selected core molecules;

d. selecting and characterizing the molecule of interest with a validated molecular structural
descriptor appropriate to whole molecules with which the virtual library was generated;

e. using the same validated molecular descriptor appropriate to whole molecules, selecting
the set of all possible molecules whose descriptor values fall within a chosen

neighborhood distance of the selected molecule; and

f. Outputting a list of the selected subset and/or the structural variations from which the
subset can be formed. 

85. The method of claim 61 further comprising a method of determining within the virtual
library, the molecules which could be created by all combinatorial arrangements of structural
variations and core molecules, which are most likely to have the same type of activity as a
molecule of interest, which is not known to be derived from a combinatorial reaction,
comprising the following steps:

a. fragmenting the molecule of interest as described in a fragmentation table; 

b. selecting a fragmentation pattern; 

c. aligning the fragments according to topomeric alignment rules;

d. generating CoMFA fields for each aligned fragment; 

e. identifying which reaction types within the virtual library correspond to the reaction type 
resulting from the fragmentation; 

f. identifying whether the fragmentation pattern generated a core, and, if so, implementing

the following steps:

(1) characterizing the core with CoMFA fields; and 

(2) identifying, by comparing the field values, whether the core resembles any cores
used in the creation of the virtual library; 

g. selecting structural variations which were used in generating the virtual library with 
cores which matched the core resulting from the fragmentation;

h. comparing the CoMFA fields of the topomerically aligned fragments with the fields of
the identified structural variations by taking the root sum of squares field differences;

i. selecting those structural variations for which the root sum of squares field difference 
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falls within a ch sen ndghborhood value; 
j. outputting a list of the selected subset and/or the structural variations from which the

subset can be formed;
k. repeating steps b through j for all possible fragments.

86. The molecules, which are most likely to have the same type of activity as a molecule
of interest which is not known to be derived from a combinatorial reaction, selected from those 
product molecules which could be created by all combinatorial arrangements of structural 
variations and core molecules, by the following computer-based method: 

a. generating a virtual library by:

(1) . creating one or more files identifying one or more combinatorial reactions for one 

or more core structures; 

(2) . creating separate structural variation files (associated with the reaction identifying 

files) in which are listed together the structural variations representative of those 
reactants which will react at each variation site of each combinatorial reaction;

(3) . associating with each structural variation, data, characterizing each structural 

variation including: 

(a) , characterization data, taking into account when necessary the structures of the 

cores with which the structural variations would be combined in the listed 
combinatorial syntheses, which has not been derived from the application of 
validated molecular structural descriptors; and 

(b). characterizing data, taking into account when necessary the structures of the

cores with which the structural variations would be combined in the listed
combinatorial syntheses, which has been derived from applying validated 
molecular structural descriptors to the structural variations; 

b. fragmenting the molecule of interest as described in a fragmentation table;

c. selecting a fragmentation pattern;

d. aligning the fragments according to topomeric alignment rules; 

e. generating CoMFA fields for each aligned fragment; 

f. identifying which reaction types within the virtual library correspond to the reaction type 
resulting from the fragmentation;

g. identifying whether the fragmentation pattern generated a core, and, if so, implementing 
the following steps: 

(1) characterizing the core with CoMFA fields; and 
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(2) identifying, by comparing the field values, whether the core resembles any cores 
used in the creation of the virtual library; 

h. selecting structural variations which were used in generating the virtual library with 
cores which matched the core resulting from the fragmentation; 

i. comparing the CoMFA fields of the topomerically aligned fragments with the fields of 
the identified structural variations by taking the root sum of squares field differences;

j. selecting those structural variations for which the root sum of squares field difference 

falls within a chosen neighborhood value; 
k. outputting a list of the selected subset and/or the structural variations from which the

subset can be formed;
l. repeating steps c through k for all possible fragments.

87. The method of claims 63 or 65 or 69 or 71 or 72 or 73 or 74 or 75 or 80 or 86 or
88 in which the following additional step is performed immediately after the step of using a
validated molecular descriptor appropriate to whole molecules:

t. repeating the previous step for another validated molecular descriptor appropriate to
whole molecules with which the Virtual Library was generated until no additional 
whole molecule descriptor remains to be used. 

88. The method of claims 63 or 65 or 70 or 71 or 72 or 73 or 74 or 75 or 81 or 86 in
which the following additional step is performed immediately after the step of using a validated 
molecular descriptor appropriate to structural variations: 

u. repeating the previous step for another validated molecular descriptor appropriate to
structural variations with which the Virtual Library was generated until no additional 
structural variation descriptor remains to be used. 

89. The method of claim 63 in which the additional step t is performed immediately after
the step of using a validated molecular descriptor appropriate to whole molecules and further
in which step u is performed immediately after the step of using a validated molecular
descriptor appropriate to structural variations: 

t. repeating the previous step for another validated molecular descriptor appropriate to 

whole molecules with which the Virtual Library was generated until no additional 

whole molecule descriptor remains to be used; and 
u. repeating the previous step for another validated molecular descriptor appropriate to

structural variations with which the Virtual Library was generated until no additional 

structural variation descriptor remains to be used. 
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90. The method of claims 61 or 63 r 65 or 70 or 71 or 72 or 73 or 74 or 86 in which 
the validated molecular structural descriptor appropriate to structural variations is topomeric
CoMFA fields.

91. The method of claim 61 or 63 or 65 or 70 or 71 or 72 or 73 or 74 or 86 in which
topomeric hydrogen bond fields are used in conjunction with the topomeric CoMFA fields 
descriptor. 

92. The method of claims 63 or 65 or 69 or 71 or 72 or 73 or 74 or 75 or 80 or 86 or 
88 in which the validated molecular structural descriptor appropriate to whole molecules is the
Tanimoto 2D coefficient. 

93. The method of claim 63 in which after step g product molecules with the following
characteristics are removed from further use in the method: 

a. toxic reactant molecules; 

b. reactant molecules containing metals, improper forms of tautomers, and interfering 
chemical groups; 

c. reactant molecules with too low a bioavailability; 

d. reactant molecules not likely to cross membranes; and 

e. reactant molecules containing biologically non-relevant groups. 

94. The method of claim 63 in which after step g product molecules with the following 
characteristics are removed from further use in the method: 

a. product molecules having MW > 750; and 

b. product molecules not having a CLOGP between -2 and 7.5.

95. The methods of selecting screening libraries as disclosed in this invention. 

96. The systems for selecting screening libraries as disclosed in this invention. 

97. The screening libraries selected by the methods or systems disclosed in this invention.

98. The metric validation method as disclosed in this invention. 

99. The method of merging libraries as disclosed in this invention. 

100. The method of lead explosion as disclosed in this invention. 

101. The methods of molecular alignment as disclosed in this invention. 

102. The new molecular structural descriptors as disclosed in this invention. 

103. The methods of generating a virtual library as disclosed in this invention. 

104. The methods of searching a virtual library as disclosed in this invention. 

105. The virtual library as disclosed in this invention. 
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